diff --git a/README.md b/README.md
index 8999b97..8a50521 100644
--- a/README.md
+++ b/README.md
@@ -4,24 +4,101 @@ Inverse Reinforcement Learning Algorithm implementation with python.
-Implemented Algorithms:
-- Maximum Entropy IRL: [1]
-- Discrete Maximum Entropy Deep IRL: [2, 3]
-- IQ-Learn
+# Implemented Algorithms
-Experiment:
-- Mountaincar: [gym](https://www.gymlibrary.dev/environments/classic_control/mountain_car/)
+## Maximum Entropy IRL: [1]
-The implementation of MaxEntropyIRL and MountainCar is based on the implementation of:
-[lets-do-irl](https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent)
+## Maximum Entropy Deep IRL
-# References
+# Experiments
-[1] [BD. Ziebart, et al., "Maximum Entropy Inverse Reinforcement Learning", AAAI 2008](https://cdn.aaai.org/AAAI/2008/AAAI08-227.pdf).
+## MountainCar-v0
+[gym](https://www.gymlibrary.dev/environments/classic_control/mountain_car/)
+
+The expert demonstrations for MountainCar-v0 are the same as those used in [lets-do-irl](https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent).
+
+*Heatmap of expert demonstrations with 400 states*:
+
+
+### Maximum Entropy Inverse Reinforcement Learning
+
+IRL using Q-Learning with a Maximum Entropy update function.
+
+#### Training
+
+*Learner training for 29000 episodes*:
+
+
+#### Heatmaps
+
+*Learner state frequencies after 1000 episodes*:
+
+
+*Learner state frequencies after 29000 episodes*:
+
+
+*State rewards heatmap after 1000 episodes*:
+
+
+*State rewards heatmap after 29000 episodes*:
+
+
+#### Testing
+
+*Testing results of the model after 29000 episodes*:
+
+
-[2] [Wulfmeier, et al., "Maximum entropy deep inverse reinforcement learning." arXiv preprint arXiv:1507.04888 (2015).](https://arxiv.org/abs/1507.04888)
-[3] [Xi-liang Chen, et al., "A Study of Continuous Maximum Entropy Deep Inverse Reinforcement Learning", Mathematical Problems in Engineering, vol. 2019, Article ID 4834516, 8 pages, 2019. https://doi.org/10.1155/2019/4834516](https://www.hindawi.com/journals/mpe/2019/4834516/)
+### Deep Maximum Entropy Inverse Reinforcement Learning
+
+IRL using Deep Q-Learning with a Maximum Entropy update function.
+
+#### Training
+
+*Learner training for 29000 episodes*:
+
+
+#### Heatmaps
+
+*Learner state frequencies after 1000 episodes*:
+
+
+*Learner state frequencies after 29000 episodes*:
+
+
+*State rewards heatmap after 1000 episodes*:
+
+
+*State rewards heatmap after 29000 episodes*:
+
+
+#### Testing
+
+*Testing results of the model after 29000 episodes*:
+
+
+
+### Deep Maximum Entropy Inverse Reinforcement Learning with Critic
+
+Coming soon...
+
+# References
+The implementation of MaxEntropyIRL and MountainCar is based on:
+[lets-do-irl](https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent)
+
+[1] [B. D. Ziebart, et al., "Maximum Entropy Inverse Reinforcement Learning", AAAI 2008](https://cdn.aaai.org/AAAI/2008/AAAI08-227.pdf).
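Both the tabular and the deep learner share the same maximum-entropy reward update: the reward weights `theta` are pushed toward the expert's state-visitation frequencies and away from the learner's, then clipped to stay non-positive. A minimal sketch of that step, mirroring `maxent_irl()` in `MaxEntropyIRL.py` and `MaxEntropyDeep.py` (the standalone function name `maxent_update` below is illustrative, not part of the package):

```python
import numpy as np

def maxent_update(theta, expert_svf, learner_svf, learning_rate=0.05):
    """One maximum-entropy gradient step on the reward weights theta."""
    gradient = expert_svf - learner_svf        # expert vs. learner state-visitation frequencies
    theta = theta + learning_rate * gradient
    return np.minimum(theta, 0.0)              # clip to non-positive values (log values)

# The IRL reward of a discretized state is then feature_matrix.dot(theta)[state_idx],
# which is what get_reward() returns in both implementations.
```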
# Installation @@ -38,7 +115,7 @@ usage: irl [-h] [--version] [--training] [--testing] [--render] ALGORITHM Implementation of IRL algorithms positional arguments: - ALGORITHM Currently supported training algorithm: [max-entropy, discrete-max-entropy-deep] + ALGORITHM Currently supported training algorithm: [max-entropy, max-entropy-deep] options: -h, --help show this help message and exit diff --git a/demo/expert_demo/expert_demo_mountaincar.npy b/demo/expert_demo/expert_demo_mountaincar.npy new file mode 100644 index 0000000..9614e5b Binary files /dev/null and b/demo/expert_demo/expert_demo_mountaincar.npy differ diff --git a/demo/heatmaps/expert_state_frequencies_mountaincar.png b/demo/heatmaps/expert_state_frequencies_mountaincar.png new file mode 100644 index 0000000..f7947b5 Binary files /dev/null and b/demo/heatmaps/expert_state_frequencies_mountaincar.png differ diff --git a/demo/heatmaps/leaner_maxent_29000_episodes.png b/demo/heatmaps/leaner_maxent_29000_episodes.png new file mode 100644 index 0000000..5157a67 Binary files /dev/null and b/demo/heatmaps/leaner_maxent_29000_episodes.png differ diff --git a/demo/heatmaps/learner_maxent_1000_episodes.png b/demo/heatmaps/learner_maxent_1000_episodes.png new file mode 100644 index 0000000..0ce1594 Binary files /dev/null and b/demo/heatmaps/learner_maxent_1000_episodes.png differ diff --git a/demo/heatmaps/learner_maxent_15000_episodes.png b/demo/heatmaps/learner_maxent_15000_episodes.png new file mode 100644 index 0000000..bc6de44 Binary files /dev/null and b/demo/heatmaps/learner_maxent_15000_episodes.png differ diff --git a/demo/heatmaps/learner_maxentropydeep_10000_episodes.png b/demo/heatmaps/learner_maxentropydeep_10000_episodes.png new file mode 100644 index 0000000..51b15af Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_10000_episodes.png differ diff --git a/demo/heatmaps/learner_maxentropydeep_1000_episodes.png b/demo/heatmaps/learner_maxentropydeep_1000_episodes.png new file mode 100644 index 0000000..6542af4 Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_1000_episodes.png differ diff --git a/demo/heatmaps/learner_maxentropydeep_15000_episodes.png b/demo/heatmaps/learner_maxentropydeep_15000_episodes.png new file mode 100644 index 0000000..2c7e28e Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_15000_episodes.png differ diff --git a/demo/heatmaps/learner_maxentropydeep_20000_episodes.png b/demo/heatmaps/learner_maxentropydeep_20000_episodes.png new file mode 100644 index 0000000..d6df196 Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_20000_episodes.png differ diff --git a/demo/heatmaps/learner_maxentropydeep_25000_episodes.png b/demo/heatmaps/learner_maxentropydeep_25000_episodes.png new file mode 100644 index 0000000..b257041 Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_25000_episodes.png differ diff --git a/demo/heatmaps/learner_maxentropydeep_29000_episodes.png b/demo/heatmaps/learner_maxentropydeep_29000_episodes.png new file mode 100644 index 0000000..5bc4dc9 Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_29000_episodes.png differ diff --git a/demo/heatmaps/learner_maxentropydeep_5000_episodes.png b/demo/heatmaps/learner_maxentropydeep_5000_episodes.png new file mode 100644 index 0000000..3f9f4ec Binary files /dev/null and b/demo/heatmaps/learner_maxentropydeep_5000_episodes.png differ diff --git a/demo/heatmaps/rewards_maxent_1000_episodes.png b/demo/heatmaps/rewards_maxent_1000_episodes.png new file mode 
100644 index 0000000..212f636 Binary files /dev/null and b/demo/heatmaps/rewards_maxent_1000_episodes.png differ diff --git a/demo/heatmaps/rewards_maxent_15000_episodes.png b/demo/heatmaps/rewards_maxent_15000_episodes.png new file mode 100644 index 0000000..963cc9c Binary files /dev/null and b/demo/heatmaps/rewards_maxent_15000_episodes.png differ diff --git a/demo/heatmaps/rewards_maxent_29000_episodes.png b/demo/heatmaps/rewards_maxent_29000_episodes.png new file mode 100644 index 0000000..d68c1a7 Binary files /dev/null and b/demo/heatmaps/rewards_maxent_29000_episodes.png differ diff --git a/demo/heatmaps/rewards_maxentropydeep_10000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_10000_episodes.png new file mode 100644 index 0000000..9a3e9dd Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_10000_episodes.png differ diff --git a/demo/heatmaps/rewards_maxentropydeep_1000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_1000_episodes.png new file mode 100644 index 0000000..3ff7728 Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_1000_episodes.png differ diff --git a/demo/heatmaps/rewards_maxentropydeep_15000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_15000_episodes.png new file mode 100644 index 0000000..d295964 Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_15000_episodes.png differ diff --git a/demo/heatmaps/rewards_maxentropydeep_20000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_20000_episodes.png new file mode 100644 index 0000000..b3b12ab Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_20000_episodes.png differ diff --git a/demo/heatmaps/rewards_maxentropydeep_25000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_25000_episodes.png new file mode 100644 index 0000000..f654887 Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_25000_episodes.png differ diff --git a/demo/heatmaps/rewards_maxentropydeep_29000_episodes.png b/demo/heatmaps/rewards_maxentropydeep_29000_episodes.png new file mode 100644 index 0000000..f654887 Binary files /dev/null and b/demo/heatmaps/rewards_maxentropydeep_29000_episodes.png differ diff --git a/demo/learning_curves/leaner_maxent_29000_episodes.png b/demo/learning_curves/leaner_maxent_29000_episodes.png new file mode 100644 index 0000000..549c68d Binary files /dev/null and b/demo/learning_curves/leaner_maxent_29000_episodes.png differ diff --git a/demo/learning_curves/learner_maxentropy_deep_29000_episodes.png b/demo/learning_curves/learner_maxentropy_deep_29000_episodes.png new file mode 100644 index 0000000..041c897 Binary files /dev/null and b/demo/learning_curves/learner_maxentropy_deep_29000_episodes.png differ diff --git a/demo/test_results/test_maxent_29000_episodes.png b/demo/test_results/test_maxent_29000_episodes.png new file mode 100644 index 0000000..7a9a177 Binary files /dev/null and b/demo/test_results/test_maxent_29000_episodes.png differ diff --git a/demo/test_results/test_maxentropydeep_29000_episodes_model_results.png b/demo/test_results/test_maxentropydeep_29000_episodes_model_results.png new file mode 100644 index 0000000..9546bcb Binary files /dev/null and b/demo/test_results/test_maxentropydeep_29000_episodes_model_results.png differ diff --git a/demo/test_results/test_maxentropydeep_best_model_results.png b/demo/test_results/test_maxentropydeep_best_model_results.png new file mode 100644 index 0000000..dab163d Binary files /dev/null and b/demo/test_results/test_maxentropydeep_best_model_results.png differ diff --git 
a/demo/trained_models/model_maxentropydeep_29000_episodes_model.pth b/demo/trained_models/model_maxentropydeep_29000_episodes_model.pth new file mode 100644 index 0000000..05a344a Binary files /dev/null and b/demo/trained_models/model_maxentropydeep_29000_episodes_model.pth differ diff --git a/demo/trained_models/model_maxentropydeep_best_model.pth b/demo/trained_models/model_maxentropydeep_best_model.pth new file mode 100644 index 0000000..87c251a Binary files /dev/null and b/demo/trained_models/model_maxentropydeep_best_model.pth differ diff --git a/demo/trained_models/qtable_maxentropy_30000_episodes.npy b/demo/trained_models/qtable_maxentropy_30000_episodes.npy new file mode 100644 index 0000000..0dfdea8 Binary files /dev/null and b/demo/trained_models/qtable_maxentropy_30000_episodes.npy differ diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/irlwpython/ContinuousMaxEntropyDeepIRL.py b/src/irlwpython/ContinuousMaxEntropyDeepIRL.py deleted file mode 100644 index 046d68f..0000000 --- a/src/irlwpython/ContinuousMaxEntropyDeepIRL.py +++ /dev/null @@ -1,193 +0,0 @@ -import gym -import numpy as np -import torch -import torch.optim as optim -import torch.nn as nn -import matplotlib.pyplot as plt - - -class ActorNetwork(nn.Module): - def __init__(self, num_inputs, num_output, hidden_size): - super(ActorNetwork, self).__init__() - self.fc1 = nn.Linear(num_inputs, hidden_size) - self.fc2 = nn.Linear(hidden_size, hidden_size) - self.fc3 = nn.Linear(hidden_size, num_output) - - def forward(self, x): - x = nn.functional.relu(self.fc1(x)) - x = nn.functional.relu(self.fc2(x)) - return self.fc3(x) # torch.nn.functional.softmax(self.fc3(x)) - - -class CriticNetwork(nn.Module): - def __init__(self, num_inputs, hidden_size): - super(CriticNetwork, self).__init__() - self.fc1 = nn.Linear(num_inputs, hidden_size) - self.fc2 = nn.Linear(hidden_size, hidden_size) - self.fc3 = nn.Linear(hidden_size, 1) - - self.theta_layer = nn.Linear(hidden_size, 3) - - def forward(self, x): - x_ = nn.functional.relu(self.fc1(x)) - x_ = nn.functional.relu(self.fc2(x_)) - theta_ = self.theta_layer(x_) - return self.fc3(x_) + torch.matmul(theta_, x) - - -class MaxEntropyDeepIRL: - def __init__(self, target, state_dim, action_dim, learning_rate=0.001, gamma=0.99, num_epochs=1000): - self.target = target - self.state_dim = state_dim - self.action_dim = action_dim - self.learning_rate = learning_rate - # self.theta = torch.rand(state_dim + 1, requires_grad=True) - self.gamma = gamma - self.num_epochs = num_epochs - self.actor_network = ActorNetwork(state_dim, action_dim, 100) - self.critic_network = CriticNetwork(state_dim + 1, 100) - self.optimizer_actor = optim.Adam(self.actor_network.parameters(), lr=learning_rate) - self.optimizer_critic = optim.Adam(self.critic_network.parameters(), lr=learning_rate) - - def get_reward(self, state, action): - state_action = list(state) + list([action]) - state_action = torch.Tensor(state_action) - return self.critic_network(state_action) - - def expert_feature_expectations(self, demonstrations): - feature_expectations = torch.zeros(self.state_dim) - - for demonstration in demonstrations: - for state, _, _ in demonstration: - state_tensor = torch.tensor(state, dtype=torch.float32) - feature_expectations += state_tensor.squeeze() - - feature_expectations /= demonstrations.shape[0] - return feature_expectations - - def maxent_irl(self, expert, learner): - # Update critic network - - self.optimizer_critic.zero_grad() - - # Loss 
function for critic network - loss_critic = torch.nn.functional.mse_loss(learner, expert) - loss_critic.backward() - - self.optimizer_critic.step() - - def update_q_network(self, state_array, action, reward, next_state): - self.optimizer_actor.zero_grad() - - state_tensor = torch.tensor(state_array, dtype=torch.float32) - next_state_tensor = torch.tensor(next_state, dtype=torch.float32) - - q_values = self.actor_network(state_tensor) - # q_1 = self.actor_network(state_tensor)[action] - # q_2 = reward + self.gamma * max(self.actor_network(next_state_tensor)) - next_q_values = reward + self.gamma * self.actor_network(next_state_tensor) - - loss_actor = nn.functional.mse_loss(q_values, next_q_values) - loss_actor.backward() - self.optimizer_actor.step() - - def get_demonstrations(self): - env_low = self.target.observation_space.low - env_high = self.target.observation_space.high - env_distance = (env_high - env_low) / 20 # self.one_feature - - raw_demo = np.load(file="expert_demo/expert_demo.npy") - demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3)) - for x in range(len(raw_demo)): - for y in range(len(raw_demo[0])): - position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0]) - velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1]) - state_idx = position_idx + velocity_idx * 20 # self.one_feature - - demonstrations[x][y][0] = state_idx - demonstrations[x][y][1] = raw_demo[x][y][2] - - print(demonstrations) - return demonstrations - - def get_expert_state_frequencies(self): - raw_demo = np.load(file="expert_demo/expert_demo.npy") - expert_state_frequencies = [] - return expert_state_frequencies - - def train(self): - demonstrations = self.get_demonstrations() - expert = self.expert_feature_expectations(demonstrations) - - expert_state_frequencies = self.get_expert_state_frequencies() - - learner_feature_expectations = torch.zeros(self.state_dim, requires_grad=True) # Add requires_grad=True - episodes, scores = [], [] - - for episode in range(self.num_epochs): - state, info = self.target.reset() - score = 0 - - if (episode != 0 and episode == 10) or (episode > 10 and episode % 5 == 0): - learner = learner_feature_expectations / episode - self.maxent_irl(expert, learner) - - while True: - state_tensor = torch.tensor(state, dtype=torch.float32) - - q_state = self.actor_network(state_tensor) - action = torch.argmax(q_state).item() - next_state, reward, done, _, _ = self.target.step(action) - - irl_reward = self.get_reward(state, action) - self.update_q_network(state, action, irl_reward, next_state) - - print("Q Actor Network", state, q_state) - print("Reward", reward, "IRL Reward", irl_reward) - - learner_feature_expectations = learner_feature_expectations + state_tensor.squeeze() - - print(expert) - print(learner_feature_expectations) - - score += reward - state = next_state - if done: - scores.append(score) - episodes.append(episode) - break - - if episode % 1 == 0: - score_avg = np.mean(scores) - print('{} episode score is {:.2f}'.format(episode, score_avg)) - plt.plot(episodes, scores, 'b') - plt.savefig("./learning_curves/maxent_30000_network.png") - - torch.save(self.q_network.state_dict(), "./results/maxent_30000_q_network.pth") - - def test(self): - episodes, scores = [], [] - - for episode in range(10): - state = self.target.reset() - score = 0 - - while True: - self.target.render() - state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0) - - action = torch.argmax(self.q_network(state_tensor)).item() - next_state, reward, done, _, _ 
= self.target.step(action) - - score += reward - state = next_state - - if done: - scores.append(score) - episodes.append(episode) - plt.plot(episodes, scores, 'b') - plt.savefig("./learning_curves/maxent_test_30000_network.png") - break - - if episode % 1 == 0: - print('{} episode score is {:.2f}'.format(episode, score)) diff --git a/src/irlwpython/DiscreteMaxEntropyDeepIRL.py b/src/irlwpython/DiscreteMaxEntropyDeepIRL.py deleted file mode 100644 index 66bf723..0000000 --- a/src/irlwpython/DiscreteMaxEntropyDeepIRL.py +++ /dev/null @@ -1,171 +0,0 @@ -import gym -import numpy as np -import torch -import torch.optim as optim -import torch.nn as nn -import matplotlib.pyplot as plt - - -class ActorNetwork(nn.Module): - def __init__(self, num_inputs, num_output, hidden_size): - super(ActorNetwork, self).__init__() - self.fc1 = nn.Linear(num_inputs, hidden_size) - self.fc2 = nn.Linear(hidden_size, hidden_size) - self.fc3 = nn.Linear(hidden_size, num_output) - - def forward(self, x): - x = nn.functional.relu(self.fc1(x)) - x = nn.functional.relu(self.fc2(x)) - return self.fc3(x) # torch.nn.functional.softmax(self.fc3(x)) - - -class CriticNetwork(nn.Module): - def __init__(self, num_inputs, hidden_size): - super(CriticNetwork, self).__init__() - self.fc1 = nn.Linear(num_inputs, hidden_size) - self.fc2 = nn.Linear(hidden_size, hidden_size) - self.fc3 = nn.Linear(hidden_size, 1) - - self.theta_layer = nn.Linear(hidden_size, 3) - - def forward(self, x): - x_ = nn.functional.relu(self.fc1(x)) - x_ = nn.functional.relu(self.fc2(x_)) - theta_ = self.theta_layer(x_) - return self.fc3(x_) + torch.matmul(theta_, x) - - -class DiscreteMaxEntropyDeepIRL: - def __init__(self, target, state_dim, action_dim, feature_matrix=None, learning_rate=0.001, gamma=0.99, - num_epochs=1000): - self.feat_matrix = feature_matrix - self.one_feature = 20 - - self.target = target - self.state_dim = state_dim - self.action_dim = action_dim - self.learning_rate = learning_rate - - self.gamma = gamma - self.num_epochs = num_epochs - self.actor_network = ActorNetwork(state_dim, action_dim, 100) - self.critic_network = CriticNetwork(state_dim + 1, 100) - self.optimizer_actor = optim.Adam(self.actor_network.parameters(), lr=learning_rate) - self.optimizer_critic = optim.Adam(self.critic_network.parameters(), lr=learning_rate) - - def get_reward(self, state, action): - state_action = list(state) + list([action]) - state_action = torch.Tensor(state_action) - return self.critic_network(state_action) - - def expert_feature_expectations(self, demonstrations): - feature_expectations = torch.zeros(400) - - for demonstration in demonstrations: - for state, _, _ in demonstration: - state_tensor = torch.tensor(state, dtype=torch.float32) - feature_expectations += state_tensor.squeeze() - - feature_expectations /= demonstrations.shape[0] - return feature_expectations - - def maxent_irl(self, expert, learner): - # Update critic network - - self.optimizer_critic.zero_grad() - - # Loss function for critic network - loss_critic = torch.nn.functional.mse_loss(learner, expert) - loss_critic.backward() - - self.optimizer_critic.step() - - def update_q_network(self, state_array, action, reward, next_state): - self.optimizer_actor.zero_grad() - - state_tensor = torch.tensor(state_array, dtype=torch.float32) - next_state_tensor = torch.tensor(next_state, dtype=torch.float32) - - q_values = self.actor_network(state_tensor) - q_1 = self.actor_network(state_tensor)[action] - - q_2 = reward + self.gamma * max(self.actor_network(next_state_tensor)) - 
next_q_values = reward + self.gamma * (q_2 - q_1) # self.actor_network(next_state_tensor) - - loss_actor = nn.functional.mse_loss(q_values, next_q_values) - loss_actor.backward() - self.optimizer_actor.step() - - def train(self): - demonstrations = self.target.get_demonstrations() - expert = self.expert_feature_expectations(demonstrations) - - learner_feature_expectations = torch.zeros(400, requires_grad=True) - episodes, scores = [], [] - - for episode in range(self.num_epochs): - state, info = self.target.env_reset() - score = 0 - - while True: - state_tensor = torch.tensor(state, dtype=torch.float32) - - q_state = self.actor_network(state_tensor) - action = torch.argmax(q_state).item() - next_state, reward, done, _, _ = self.target.env_step(action) - - # Actor update - irl_reward = self.get_reward(state, action) - self.update_q_network(state, action, irl_reward, next_state) - - score += reward - state = next_state - if done: - scores.append(score) - episodes.append(episode) - break - - # Critic update - state_idx = state[0] + state[1] * self.one_feature - learner_feature_expectations = learner_feature_expectations + torch.Tensor( - self.feat_matrix[int(state_idx)]) - learner = learner_feature_expectations / episode - self.maxent_irl(expert, learner) - - if episode % 1 == 0: - score_avg = np.mean(scores) - print('{} episode score is {:.2f}'.format(episode, score_avg)) - plt.plot(episodes, scores, 'b') - plt.savefig("./learning_curves/discretemaxentdeep_30000.png") - - torch.save(self.actor_network.state_dict(), "./results/discretemaxentdeep_30000_actor.pth") - torch.save(self.critic_network.state_dict(), "./results/discretemaxentdeep_30000_critic.pth") - - def test(self): - assert 1 == 0 # TODO: not implemented yet - - episodes, scores = [], [] - - for episode in range(10): - state = self.target.env_reset() - score = 0 - - while True: - self.target.env_render() - state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0) - - action = torch.argmax(self.actor_network(state_tensor)).item() - next_state, reward, done, _, _ = self.target.env_step(action) - - score += reward - state = next_state - - if done: - scores.append(score) - episodes.append(episode) - plt.plot(episodes, scores, 'b') - plt.savefig("./learning_curves/discretemaxentdeep_test_30000.png") - break - - if episode % 1 == 0: - print('{} episode score is {:.2f}'.format(episode, score)) diff --git a/src/irlwpython/GenerateDemonstrationsMountainCar.py b/src/irlwpython/GenerateDemonstrationsMountainCar.py new file mode 100644 index 0000000..3e37c0c --- /dev/null +++ b/src/irlwpython/GenerateDemonstrationsMountainCar.py @@ -0,0 +1,44 @@ +import gym +import readchar +import numpy as np + +# # MACROS +Push_Left = 0 +No_Push = 1 +Push_Right = 2 + +# Key mapping +arrow_keys = { + '\x1b[D': Push_Left, + '\x1b[B': No_Push, + '\x1b[C': Push_Right} + +env = gym.make('MountainCar-v0')#, render_mode="human") + +trajectories = [] +episode_step = 0 + +for episode in range(1): # n_trajectories : 20 + trajectory = [] + step = 0 + + env.reset() + print("episode_step", episode_step) + + while True: + env.render() + print("step", step) + + key = readchar.readkey() + if key not in arrow_keys.keys(): + break + + action = arrow_keys[key] + state, reward, done, _, _ = env.step(action) + + if state[0] >= env.env.goal_position and step > 129: # trajectory_length : 130 + break + + trajectory.append((state[0], state[1], action)) + step += 1 + print(trajectory) diff --git a/src/irlwpython/MaxEntropyDeep.py b/src/irlwpython/MaxEntropyDeep.py new 
file mode 100644 index 0000000..5873f69 --- /dev/null +++ b/src/irlwpython/MaxEntropyDeep.py @@ -0,0 +1,211 @@ +import numpy as np +import torch +import torch.optim as optim +import torch.nn as nn +import matplotlib.pyplot as plt +import PIL + + +class QNetwork(nn.Module): + def __init__(self, input_size, output_size): + super(QNetwork, self).__init__() + self.fc1 = nn.Linear(input_size, 128) + self.relu1 = nn.ReLU() + # self.fc2 = nn.Linear(128, 128) + # self.relu2 = nn.ReLU() + self.output_layer = nn.Linear(128, output_size) + + def forward(self, state): + x = self.fc1(state) + x = self.relu1(x) + # x = self.fc2(x) + # x = self.relu2(x) + q_values = self.output_layer(x) + return q_values + + +class MaxEntropyDeepIRL: + def __init__(self, target, state_dim, action_size, feature_matrix=None, one_feature=None, theta=None, + learning_rate=0.001, gamma=0.99): + self.feature_matrix = feature_matrix + self.one_feature = one_feature + + self.target = target # Environment + + self.q_network = QNetwork(state_dim, action_size) + self.target_q_network = QNetwork(state_dim, action_size) + self.target_q_network.load_state_dict(self.q_network.state_dict()) + self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate) + + self.gamma = gamma + + self.theta_learning_rate = 0.05 + self.theta = theta + + def select_action(self, state, epsilon): + if np.random.rand() < epsilon: + return np.random.choice(3) + else: + with torch.no_grad(): + q_values = self.q_network(torch.FloatTensor(state)) + return torch.argmax(q_values).item() + + def get_reward(self, n_states, state_idx): + """ + Returns the achieved reward. + :param n_states: + :param state_idx: + :return: + """ + irl_rewards = self.feature_matrix.dot(self.theta).reshape((n_states,)) + return irl_rewards[state_idx] + + def expert_feature_expectations(self, demonstrations): + feature_expectations = np.zeros(self.feature_matrix.shape[0]) + + for demonstration in demonstrations: + for state_idx, _, _ in demonstration: + feature_expectations += self.feature_matrix[int(state_idx)] + + feature_expectations /= demonstrations.shape[0] + return feature_expectations + + def maxent_irl(self, expert, learner): + """ + Max Entropy Learning step. 
+ :param expert: + :param learner: + :param learning_rate: + :return: + """ + gradient = expert - learner + self.theta += self.theta_learning_rate * gradient + + # Clip theta + for j in range(len(self.theta)): + if self.theta[j] > 0: # log values + self.theta[j] = 0 + + def update_q_network(self, state, action, reward, next_state, done): + state = torch.FloatTensor(state) + next_state = torch.FloatTensor(next_state) + q_values = self.q_network(state) + next_q_values = self.target_q_network(next_state) + + target = q_values.clone() + if not done: + target[action] = reward + self.gamma * torch.max(next_q_values).item() + else: + target[action] = reward + + loss = nn.MSELoss()(q_values, target.detach()) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + def update_target_network(self): + self.target_q_network.load_state_dict(self.q_network.state_dict()) + + def train(self, n_states, episodes=30000, max_steps=200, + epsilon_start=1.0, + epsilon_decay=0.995, epsilon_min=0.01): + demonstrations = self.target.get_demonstrations() + expert = self.expert_feature_expectations(demonstrations) + plt.imshow(expert.reshape((20, 20)), cmap='viridis', interpolation='nearest') + plt.savefig("src/irlwpython/heatmap/expert_deep.png") + + learner_feature_expectations = np.zeros(n_states) + + epsilon = epsilon_start + episode_arr, scores = [], [] + + for episode in range(episodes): + state, info = self.target.env_reset() + total_reward = 0 + + # Mini-Batches: + if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0): + # calculate density + learner = learner_feature_expectations / episode + # Maximum Entropy IRL step + self.maxent_irl(expert, learner) + + for step in range(max_steps): + action = self.select_action(state, epsilon) + + next_state, reward, done, _, _ = self.target.env_step(action) + # Real Reward + total_reward += reward + + # IRL + state_idx = self.target.state_to_idx(state) + irl_reward = self.get_reward(n_states, state_idx) + + self.update_q_network(state, action, irl_reward, next_state, done) + self.update_target_network() + + # State counting for densitiy + learner_feature_expectations += self.feature_matrix[int(state_idx)] + + state = next_state + if done: + break + + scores.append(total_reward) + episode_arr.append(episode) + epsilon = max(epsilon * epsilon_decay, epsilon_min) + print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon}") + + if episode % 1000 == 0 and episode != 0: + score_avg = np.mean(scores) + print('{} episode average score is {:.2f}'.format(episode, score_avg)) + plt.plot(episode_arr, scores, 'b') + learner = learner_feature_expectations / episode + plt.savefig(f"src/irlwpython/learning_curves/maxent_{episodes}_{episode}_qnetwork_class.png") + plt.imshow(learner.reshape((20, 20)), cmap='viridis', interpolation='nearest') + plt.savefig(f"src/irlwpython/heatmap/learner_{episode}_deep_class.png") + plt.imshow(self.theta.reshape((20, 20)), cmap='viridis', interpolation='nearest') + plt.savefig(f"src/irlwpython/heatmap/theta_{episode}_deep_class.png") + plt.imshow(self.feature_matrix.dot(self.theta).reshape((20, 20)), cmap='viridis', + interpolation='nearest') + plt.savefig(f"src/irlwpython/heatmap/rewards_{episode}_deep_class.png") + + torch.save(self.q_network.state_dict(), f"./results/maxent_{episodes}_{episode}_network_class.pth") + + if episode == episodes - 1: + plt.plot(episode_arr, scores, 'b') + plt.savefig(f"src/irlwpython/learning_curves/maxentdeep_{episodes}_qdeep_class.png") + + 
torch.save(self.q_network.state_dict(), f"src/irlwpython/results/maxentdeep_{episodes}_q_network_class.pth") + + def test(self, model_path, epsilon=0.01): + """ + Tests the previous trained model + :return: + """ + self.q_network.load_state_dict(torch.load(model_path)) + #self.q_network #.eval() + + episodes, scores = [], [] + + for episode in range(10): + state, info = self.target.env_reset() + score = 0 + + while True: + self.target.env_render() + action = self.select_action(state, epsilon) + next_state, reward, done, _, _ = self.target.env_step(action) + + score += reward + state = next_state + + if done: + scores.append(score) + episodes.append(episode) + plt.plot(episodes, scores, 'b') + plt.savefig("src/irlwpython/learning_curves/test_maxentropydeep_best_model_results.png") + break + + if episode % 1 == 0: + print('{} episode score is {:.2f}'.format(episode, score)) diff --git a/src/irlwpython/MaxEntropyIRL.py b/src/irlwpython/MaxEntropyIRL.py index f415bdd..b3117dc 100644 --- a/src/irlwpython/MaxEntropyIRL.py +++ b/src/irlwpython/MaxEntropyIRL.py @@ -1,11 +1,12 @@ # -# This file is hardly inspired by the IRL implementation of: +# This file is a refactored implementation of the Maximum Entropy IRL from: # https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent # It is a class type implementation restructured for our use case. # import numpy as np import matplotlib.pyplot as plt +import PIL class MaxEntropyIRL: @@ -64,7 +65,7 @@ def maxent_irl(self, expert, learner, learning_rate): # Clip theta for j in range(len(self.theta)): - if self.theta[j] > 0: # log values + if self.theta[j] > 0: # log values self.theta[j] = 0 def update_q_table(self, state, action, reward, next_state): @@ -80,7 +81,7 @@ def update_q_table(self, state, action, reward, next_state): q_2 = reward + self.gamma * max(self.q_table[next_state]) self.q_table[state][action] += self.q_learning_rate * (q_2 - q_1) - def train(self, theta_learning_rate): + def train(self, theta_learning_rate, episode_count=30000): """ Trains a model. :param theta_learning_rate: @@ -95,7 +96,7 @@ def train(self, theta_learning_rate): learner_feature_expectations = np.zeros(self.n_states) episodes, scores = [], [] # For every episode - for episode in range(30000): + for episode in range(episode_count): # Resets the environment to an initial state and returns the initial observation. # Start position is in random range of [-0.6, -0.4] state = self.target.env_reset() @@ -111,7 +112,7 @@ def train(self, theta_learning_rate): # One Step in environment state = state[0] while True: - state_idx = self.target.idx_to_state(state) + state_idx = self.target.state_to_idx(state) action = np.argmax(self.q_table[state_idx]) # Run one timestep of the environment's dynamics. 
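The hunk above switches from `idx_to_state` to the clearer name `state_to_idx`; a minimal sketch of the discretization behind that call, following `MountainCar.state_to_idx()` shown later in this diff (the standalone signature and the example state are illustrative):

```python
import gym
import numpy as np

def state_to_idx(state, env_low, env_high, one_feature=20):
    """Map a continuous (position, velocity) observation onto a 20 x 20 grid (400 states)."""
    env_distance = (env_high - env_low) / one_feature
    position_idx = int((state[0] - env_low[0]) / env_distance[0])
    velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
    return position_idx + velocity_idx * one_feature

env = gym.make("MountainCar-v0")
low, high = env.observation_space.low, env.observation_space.high
print(state_to_idx(np.array([-0.5, 0.0]), low, high))  # index of a mid-track state at rest
```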
@@ -119,7 +120,7 @@ def train(self, theta_learning_rate): # get pseudo-reward and update q table irl_reward = self.get_reward(self.n_states, state_idx) - next_state_idx = self.target.idx_to_state(next_state) + next_state_idx = self.target.state_to_idx(next_state) self.update_q_table(state_idx, action, irl_reward, next_state_idx) # State counting for densitiy @@ -132,12 +133,20 @@ def train(self, theta_learning_rate): episodes.append(episode) break - if episode % 1000 == 0: + if episode % 1000 == 0 and episode != 0: score_avg = np.mean(scores) print('{} episode score is {:.2f}'.format(episode, score_avg)) plt.plot(episodes, scores, 'b') - plt.savefig("./learning_curves/maxent_30000.png") - np.save("./results/maxent_30000_table", arr=self.q_table) + plt.savefig(f"src/irlwpython/learning_curves/maxent_{episode}_qtable.png") + np.save(f"src/irlwpython/results/maxent_{episode}_qtable", arr=self.q_table) + learner = learner_feature_expectations / episode + plt.imshow(learner.reshape((20, 20)), cmap='viridis', interpolation='nearest') + plt.savefig(f"src/irlwpython/heatmap/learner_{episode}_qtable.png") + plt.imshow(self.theta.reshape((20, 20)), cmap='viridis', interpolation='nearest') + plt.savefig(f"src/irlwpython/heatmap/theta_{episode}_qtable.png") + plt.imshow(self.feature_matrix.dot(self.theta).reshape((20, 20)), cmap='viridis', + interpolation='nearest') + plt.savefig(f"src/irlwpython/heatmap/rewards_{episode}_qtable.png") def test(self): """ @@ -153,7 +162,7 @@ def test(self): state = state[0] while True: self.target.env_render() - state_idx = self.target.idx_to_state(state) + state_idx = self.target.state_to_idx(state) action = np.argmax(self.q_table[state_idx]) next_state, reward, done, _, _ = self.target.env_step(action) @@ -164,8 +173,8 @@ def test(self): scores.append(score) episodes.append(episode) plt.plot(episodes, scores, 'b') - plt.savefig("./learning_curves/maxent_test_30000.png") + plt.savefig("src/irlwpython/learning_curves/maxent_test_30000_maxentropy.png") break if episode % 1 == 0: - print('{} episode score is {:.2f}'.format(episode, score)) \ No newline at end of file + print('{} episode score is {:.2f}'.format(episode, score)) diff --git a/src/irlwpython/MountainCar.py b/src/irlwpython/MountainCar.py index 981e426..2543615 100644 --- a/src/irlwpython/MountainCar.py +++ b/src/irlwpython/MountainCar.py @@ -1,5 +1,5 @@ # -# This file is hardly inspired by the IRL implementation of: +# This file is a refactored implementation of the environment form: # https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent # It is a class type implementation restructured for our use case. # @@ -27,7 +27,7 @@ def get_demonstrations(self): env_high = self.env.observation_space.high env_distance = (env_high - env_low) / self.one_feature - raw_demo = np.load(file="expert_demo/expert_demo.npy") + raw_demo = np.load(file="src/irlwpython/expert_demo/expert_demo.npy") demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3)) for x in range(len(raw_demo)): for y in range(len(raw_demo[0])): @@ -40,7 +40,7 @@ def get_demonstrations(self): return demonstrations - def idx_to_state(self, state): + def state_to_idx(self, state): """ Converts state (pos, vel) to the integer value using the mountain car environment. 
:param state: @@ -55,6 +55,14 @@ def idx_to_state(self, state): state_idx = position_idx + velocity_idx * self.one_feature return state_idx + def discretize_state(self, state): + env_low = self.env.observation_space.low + env_high = self.env.observation_space.high + env_distance = (env_high - env_low) / self.one_feature + position_idx = int((state[0] - env_low[0]) / env_distance[0]) + velocity_idx = int((state[1] - env_low[1]) / env_distance[1]) + return [position_idx, velocity_idx] + def env_action_space(self): return self.env.action_space diff --git a/src/irlwpython/learning_curves/maxent_300.png b/src/irlwpython/learning_curves/maxent_300.png deleted file mode 100644 index c444b6b..0000000 Binary files a/src/irlwpython/learning_curves/maxent_300.png and /dev/null differ diff --git a/src/irlwpython/learning_curves/maxent_30000.png b/src/irlwpython/learning_curves/maxent_30000.png deleted file mode 100644 index 1ea3b08..0000000 Binary files a/src/irlwpython/learning_curves/maxent_30000.png and /dev/null differ diff --git a/src/irlwpython/learning_curves/maxent_30000_network.png b/src/irlwpython/learning_curves/maxent_30000_network.png deleted file mode 100644 index 273e9ae..0000000 Binary files a/src/irlwpython/learning_curves/maxent_30000_network.png and /dev/null differ diff --git a/src/irlwpython/learning_curves/maxent_test.png b/src/irlwpython/learning_curves/maxent_test.png deleted file mode 100644 index 57a2b2b..0000000 Binary files a/src/irlwpython/learning_curves/maxent_test.png and /dev/null differ diff --git a/src/irlwpython/learning_curves/maxent_test_300.png b/src/irlwpython/learning_curves/maxent_test_300.png deleted file mode 100644 index 9e53fcd..0000000 Binary files a/src/irlwpython/learning_curves/maxent_test_300.png and /dev/null differ diff --git a/src/irlwpython/learning_curves/maxent_test_30000.png b/src/irlwpython/learning_curves/maxent_test_30000.png deleted file mode 100644 index 5c84ed9..0000000 Binary files a/src/irlwpython/learning_curves/maxent_test_30000.png and /dev/null differ diff --git a/src/irlwpython/main.py b/src/irlwpython/main.py index a877472..4ce1b58 100644 --- a/src/irlwpython/main.py +++ b/src/irlwpython/main.py @@ -1,11 +1,12 @@ import argparse import logging + import numpy as np import sys +from irlwpython.MaxEntropyDeep import MaxEntropyDeepIRL from irlwpython.MountainCar import MountainCar from irlwpython.MaxEntropyIRL import MaxEntropyIRL -from irlwpython.DiscreteMaxEntropyDeepIRL import DiscreteMaxEntropyDeepIRL from irlwpython import __version__ @@ -15,9 +16,6 @@ _logger = logging.getLogger(__name__) -np.random.seed(1) - - def parse_args(args): """Parse command line parameters @@ -35,7 +33,7 @@ def parse_args(args): version=f"IRLwPython {__version__}", ) parser.add_argument('algorithm', metavar='ALGORITHM', type=str, - help='Currently supported training algorithm: [max-entropy, discrete-max-entropy-deep]') + help='Currently supported training algorithm: [max-entropy, max-entropy-deep]') parser.add_argument('--training', action='store_true', help="Enables training of model.") parser.add_argument('--testing', action='store_true', help="Enables testing of previously created model.") @@ -76,7 +74,7 @@ def main(args): gamma = 0.99 q_learning_rate = 0.03 - # Theta works as Critic + # Theta works as Rewards theta_learning_rate = 0.05 theta = -(np.random.uniform(size=(n_states,))) @@ -85,16 +83,14 @@ def main(args): else: car = MountainCar(False, one_feature) - if args.algorithm == "discrete-max-entropy-deep" and args.training: - state_dim = 2 - + 
if args.algorithm == "max-entropy-deep" and args.training: # Run MaxEnt Deep IRL using MountainCar environment - maxent_deep_irl_agent = DiscreteMaxEntropyDeepIRL(car, state_dim, n_actions, feature_matrix) - maxent_deep_irl_agent.train() - # maxent_deep_irl_agent.test() + trainer = MaxEntropyDeepIRL(car, 2, n_actions, feature_matrix, one_feature, theta) + trainer.train(400) - if args.algorithm == "discrete-max-entropy-deep" and args.testing: - pass + if args.algorithm == "max-entropy-deep" and args.testing: + trainer = MaxEntropyDeepIRL(car, 2, n_actions, feature_matrix, one_feature, theta) + trainer.test("demo/trained_models/model_maxentropydeep_best_model.pth") if args.algorithm == "max-entropy" and args.training: q_table = np.zeros((n_states, n_actions)) @@ -102,7 +98,7 @@ def main(args): trainer.train(theta_learning_rate) if args.algorithm == "max-entropy" and args.testing: - q_table = np.load(file="./results/maxent_q_table.npy") + q_table = np.load(file="demo/trained_models/qtable_maxentropy_30000_episodes.npy") trainer = MaxEntropyIRL(car, feature_matrix, one_feature, q_table, q_learning_rate, gamma, n_states, theta) trainer.test() diff --git a/src/irlwpython/results/maxent_30000_table.npy b/src/irlwpython/results/maxent_30000_table.npy deleted file mode 100644 index f7ffb99..0000000 Binary files a/src/irlwpython/results/maxent_30000_table.npy and /dev/null differ diff --git a/src/irlwpython/results/maxent_300_table.npy b/src/irlwpython/results/maxent_300_table.npy deleted file mode 100644 index 63b2f3c..0000000 Binary files a/src/irlwpython/results/maxent_300_table.npy and /dev/null differ diff --git a/src/irlwpython/results/maxent_q_table.npy b/src/irlwpython/results/maxent_q_table.npy deleted file mode 100644 index b2adff1..0000000 Binary files a/src/irlwpython/results/maxent_q_table.npy and /dev/null differ diff --git a/src/irlwpython/scripts/direct_train_deep_max_entropy.py b/src/irlwpython/scripts/direct_train_deep_max_entropy.py new file mode 100644 index 0000000..55fbf75 --- /dev/null +++ b/src/irlwpython/scripts/direct_train_deep_max_entropy.py @@ -0,0 +1,237 @@ +import torch +import torch.nn as nn +import torch.optim as optim +import gym +import numpy as np +import matplotlib.pyplot as plt + +class QNetwork(nn.Module): + def __init__(self, input_size, output_size): + super(QNetwork, self).__init__() + self.fc1 = nn.Linear(input_size, 128) + self.relu1 = nn.ReLU() + # self.fc2 = nn.Linear(128, 128) + # self.relu2 = nn.ReLU() + self.output_layer = nn.Linear(128, output_size) + + def forward(self, state): + x = self.fc1(state) + x = self.relu1(x) + # x = self.fc2(x) + # x = self.relu2(x) + q_values = self.output_layer(x) + return q_values + + +# Define the DQN Agent +class DQNAgent: + def __init__(self, state_size, action_size, theta, feature_matrix, one_feature, learning_rate=0.001, gamma=0.99): + self.q_network = QNetwork(state_size, action_size) + self.target_q_network = QNetwork(state_size, action_size) + self.target_q_network.load_state_dict(self.q_network.state_dict()) + self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate) + self.gamma = gamma + + self.theta_learning_rate = 0.05 + self.theta = theta + self.feature_matrix = feature_matrix + self.one_feature = one_feature + + def select_action(self, state, epsilon): + if np.random.rand() < epsilon: + return np.random.choice(3) + else: + with torch.no_grad(): + q_values = self.q_network(torch.FloatTensor(state)) + return torch.argmax(q_values).item() + + def update_q_network(self, state, action, 
reward, next_state, done): + state = torch.FloatTensor(state) + next_state = torch.FloatTensor(next_state) + q_values = self.q_network(state) + next_q_values = self.target_q_network(next_state) + + target = q_values.clone() + if not done: + target[action] = reward + self.gamma * torch.max(next_q_values).item() + else: + target[action] = reward + + loss = nn.MSELoss()(q_values, target.detach()) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + def update_target_network(self): + self.target_q_network.load_state_dict(self.q_network.state_dict()) + + def state_to_idx(self, env, state): + """ + Converts state (pos, vel) to the integer value using the mountain car environment. + :param state: + :return: + """ + """ """ + env_low = env.observation_space.low + env_high = env.observation_space.high + env_distance = (env_high - env_low) / self.one_feature + position_idx = int((state[0] - env_low[0]) / env_distance[0]) + velocity_idx = int((state[1] - env_low[1]) / env_distance[1]) + state_idx = position_idx + velocity_idx * self.one_feature + return state_idx + + def discretize_state(self, env, state): + env_low = env.observation_space.low + env_high = env.observation_space.high + env_distance = (env_high - env_low) / self.one_feature + position_idx = int((state[0] - env_low[0]) / env_distance[0]) + velocity_idx = int((state[1] - env_low[1]) / env_distance[1]) + return [position_idx, velocity_idx] + + def get_demonstrations(self, env): + """ + Parses the demonstrations and returns the demonstrations. + :param one_feature: + :return: + """ + env_low = env.observation_space.low + env_high = env.observation_space.high + env_distance = (env_high - env_low) / self.one_feature + + raw_demo = np.load(file="../expert_demo/expert_demo.npy") + demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3)) + for x in range(len(raw_demo)): + for y in range(len(raw_demo[0])): + position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0]) + velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1]) + state_idx = position_idx + velocity_idx * self.one_feature + demonstrations[x][y][0] = state_idx + demonstrations[x][y][1] = raw_demo[x][y][2] + return demonstrations + + def expert_feature_expectations(self, demonstrations): + feature_expectations = np.zeros(self.feature_matrix.shape[0]) + + for demonstration in demonstrations: + for state_idx, _, _ in demonstration: + feature_expectations += self.feature_matrix[int(state_idx)] + + feature_expectations /= demonstrations.shape[0] + return feature_expectations + + def get_reward(self, n_states, state_idx): + """ + Returns the achieved reward. + :param n_states: + :param state_idx: + :return: + """ + irl_rewards = self.feature_matrix.dot(self.theta).reshape((n_states,)) + return irl_rewards[state_idx] + + def maxent_irl(self, expert, learner): + """ + Max Entropy Learning step. 
+ :param expert: + :param learner: + :param learning_rate: + :return: + """ + gradient = expert - learner + self.theta += self.theta_learning_rate * gradient + + print("Theta", self.theta) + + # Clip theta + for j in range(len(self.theta)): + if self.theta[j] > 0: # log values + self.theta[j] = 0 + + +# Training Loop +def train(agent, env, expert, learner_feature_expectations, n_states, episodes=30000, max_steps=10000, epsilon_start=1.0, + epsilon_decay=0.995, epsilon_min=0.01): + epsilon = epsilon_start + episode_arr, scores = [], [] + + for episode in range(episodes): + state, info = env.reset() + total_reward = 0 + + # Mini-Batches: + if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0): + # calculate density + learner = learner_feature_expectations / episode + # Maximum Entropy IRL step + agent.maxent_irl(expert, learner) + + for step in range(max_steps): + action = agent.select_action(state, epsilon) + + next_state, reward, done, _, _ = env.step(action) + # Real Reward + total_reward += reward + + # IRL + state_idx = agent.state_to_idx(env, state) + irl_reward = agent.get_reward(n_states, state_idx) + + agent.update_q_network(state, action, irl_reward, next_state, done) + agent.update_target_network() + + # State counting for densitiy + learner_feature_expectations += agent.feature_matrix[int(state_idx)] + + state = next_state + if done: + break + + scores.append(total_reward) + episode_arr.append(episode) + epsilon = max(epsilon * epsilon_decay, epsilon_min) + print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon}") + + if episode % 1000 == 0 and episode != 0: + score_avg = np.mean(scores) + print('{} episode average score is {:.2f}'.format(episode, score_avg)) + plt.plot(episode_arr, scores, 'b') + plt.savefig(f"../learning_curves/maxent_{episodes}_{episode}_qnetwork.png") + learner = learner_feature_expectations / episode + plt.imshow(learner.reshape((20, 20)), cmap='viridis', interpolation='nearest') + plt.savefig(f"../heatmap/learner_{episode}_deep.png") + plt.imshow(theta.reshape((20, 20)), cmap='viridis', interpolation='nearest') + plt.savefig(f"../heatmap/theta_{episode}_deep.png") + plt.imshow(feature_matrix.dot(theta).reshape((20, 20)), cmap='viridis', interpolation='nearest') + plt.savefig(f"../heatmap/rewards_{episode}_deep.png") + + torch.save(agent.q_network.state_dict(), f"../results/maxent_{episodes}_{episode}_network_main.pth") + + if episode == episodes - 1: + plt.plot(episode_arr, scores, 'b') + plt.savefig(f"../learning_curves/maxentdeep_{episodes}_qdeep_main.png") + + torch.save(agent.q_network.state_dict(), f"../results/maxentdeep_{episodes}_q_network_main.pth") + + +# Main function +if __name__ == "__main__": + env = gym.make('MountainCar-v0') + state_size = env.observation_space.shape[0] + action_size = 3 # env.action_space.n + + # Feature Matrix + n_states = 400 # 20 * 20 + one_feature = 20 # number of state per one feature + feature_matrix = np.eye(n_states) + + # Theta works as Rewards + theta_learning_rate = 0.01 + theta = -(np.random.uniform(size=(n_states,))) + + agent = DQNAgent(state_size, action_size, theta, feature_matrix, one_feature) + + demonstrations = agent.get_demonstrations(env) + expert = agent.expert_feature_expectations(demonstrations) + learner_feature_expectations = np.zeros(n_states) + + train(agent, env, expert, learner_feature_expectations, n_states)
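The standalone script above duplicates the package logic for quick experimentation. A minimal sketch of driving the same training through the refactored package classes instead, using the wiring that `main.py` in this diff uses (hyperparameters taken from the script above):

```python
import numpy as np

from irlwpython.MountainCar import MountainCar
from irlwpython.MaxEntropyDeep import MaxEntropyDeepIRL

n_states = 400                      # 20 x 20 grid over (position, velocity)
one_feature = 20                    # number of bins per state dimension
n_actions = 3                       # push left, no push, push right
feature_matrix = np.eye(n_states)   # one-hot state features
theta = -np.random.uniform(size=(n_states,))  # reward weights, kept non-positive

car = MountainCar(False, one_feature)   # False: train without rendering
trainer = MaxEntropyDeepIRL(car, 2, n_actions, feature_matrix, one_feature, theta)
trainer.train(n_states)             # main.py calls trainer.train(400)
# trainer.test("demo/trained_models/model_maxentropydeep_best_model.pth")
```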