2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
# IRLwPython

<img src="logo/IRLwPython.jpg" width="200">
<img src="logo/IRLwPython.png" width="200">

Inverse Reinforcement Learning algorithm implementation in Python.

Binary file removed logo/IRLwPython.jpg
Binary file added logo/IRLwPython.png
185 changes: 185 additions & 0 deletions src/irlwpython/MaxEntropyDeepIRL.py
@@ -0,0 +1,185 @@
import gym
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt


class ActorNetwork(nn.Module):
def __init__(self, num_inputs, num_output, hidden_size):
super(ActorNetwork, self).__init__()
self.fc1 = nn.Linear(num_inputs, hidden_size)
self.fc2 = nn.Linear(hidden_size, hidden_size)
self.fc3 = nn.Linear(hidden_size, num_output)

def forward(self, x):
x = nn.functional.relu(self.fc1(x))
x = nn.functional.relu(self.fc2(x))
return self.fc3(x) # torch.nn.functional.softmax(self.fc3(x))


class CriticNetwork(nn.Module):
def __init__(self, num_inputs, hidden_size):
super(CriticNetwork, self).__init__()
self.fc1 = nn.Linear(num_inputs, hidden_size)
self.fc2 = nn.Linear(hidden_size, hidden_size)
self.fc3 = nn.Linear(hidden_size, 1)

self.theta_layer = nn.Linear(hidden_size, 3)

def forward(self, x):
x_ = nn.functional.relu(self.fc1(x))
x_ = nn.functional.relu(self.fc2(x_))
theta_ = self.theta_layer(x_)
return self.fc3(x_) + torch.matmul(theta_, x)


class MaxEntropyDeepIRL:
def __init__(self, target, state_dim, action_dim, learning_rate=0.001, gamma=0.99, num_epochs=1000):
self.target = target
self.state_dim = state_dim
self.action_dim = action_dim
self.learning_rate = learning_rate
# self.theta = torch.rand(state_dim + 1, requires_grad=True)
self.gamma = gamma
self.num_epochs = num_epochs
self.actor_network = ActorNetwork(state_dim, action_dim, 100)
self.critic_network = CriticNetwork(state_dim + 1, 100)
self.optimizer_actor = optim.Adam(self.actor_network.parameters(), lr=learning_rate)
self.optimizer_critic = optim.Adam(self.critic_network.parameters(), lr=learning_rate)

def get_reward(self, state, action):
state_action = list(state) + list([action])
state_action = torch.Tensor(state_action)
return self.critic_network(state_action)

def expert_feature_expectations(self, demonstrations):
feature_expectations = torch.zeros(self.state_dim)

for demonstration in demonstrations:
for state, _, _ in demonstration:
state_tensor = torch.tensor(state, dtype=torch.float32)
feature_expectations += state_tensor.squeeze()

feature_expectations /= demonstrations.shape[0]
return feature_expectations

def maxent_irl(self, expert, learner):
# Update critic network

self.optimizer_critic.zero_grad()

# Loss function for critic network
loss_critic = torch.nn.functional.mse_loss(learner, expert)
loss_critic.backward()

self.optimizer_critic.step()

def update_q_network(self, state_array, action, reward, next_state):
self.optimizer_actor.zero_grad()

state_tensor = torch.tensor(state_array, dtype=torch.float32)
next_state_tensor = torch.tensor(next_state, dtype=torch.float32)

q_values = self.actor_network(state_tensor)
# q_1 = self.actor_network(state_tensor)[action]
# q_2 = reward + self.gamma * max(self.actor_network(next_state_tensor))
# TD target; detach so this actor update does not backpropagate into the critic reward
next_q_values = (reward + self.gamma * self.actor_network(next_state_tensor)).detach()

loss_actor = nn.functional.mse_loss(q_values, next_q_values)
loss_actor.backward()
self.optimizer_actor.step()

def get_demonstrations(self):
env_low = self.target.observation_space.low
env_high = self.target.observation_space.high
env_distance = (env_high - env_low) / 20 # self.one_feature

raw_demo = np.load(file="expert_demo/expert_demo.npy")
demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
for x in range(len(raw_demo)):
for y in range(len(raw_demo[0])):
position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
state_idx = position_idx + velocity_idx * 20 # self.one_feature

demonstrations[x][y][0] = state_idx
demonstrations[x][y][1] = raw_demo[x][y][2]

return demonstrations

def train(self):
demonstrations = self.get_demonstrations()
expert = self.expert_feature_expectations(demonstrations)

learner_feature_expectations = torch.zeros(self.state_dim, requires_grad=True)  # learner state-visitation feature accumulator
episodes, scores = [], []

for episode in range(self.num_epochs):
state, info = self.target.reset()
score = 0

if episode == 10 or (episode > 10 and episode % 5 == 0):
learner = learner_feature_expectations / episode
self.maxent_irl(expert, learner)

while True:
state_tensor = torch.tensor(state, dtype=torch.float32)

q_state = self.actor_network(state_tensor)
action = torch.argmax(q_state).item()
next_state, reward, terminated, truncated, _ = self.target.step(action)
done = terminated or truncated  # treat time-limit truncation as episode end

irl_reward = self.get_reward(state, action)
self.update_q_network(state, action, irl_reward, next_state)

print("Q Actor Network", state, q_state)
print("Reward", reward, "IRL Reward", irl_reward)

learner_feature_expectations = learner_feature_expectations + state_tensor.squeeze()

print(expert)
print(learner_feature_expectations)

score += reward
state = next_state
if done:
scores.append(score)
episodes.append(episode)
break

if episode % 1 == 0:
score_avg = np.mean(scores)
print('{} episode score is {:.2f}'.format(episode, score_avg))
plt.plot(episodes, scores, 'b')
plt.savefig("./learning_curves/maxent_30000_network.png")

torch.save(self.actor_network.state_dict(), "./results/maxent_30000_q_network.pth")

def test(self):
episodes, scores = [], []

for episode in range(10):
state, _ = self.target.reset()  # gym >= 0.26 returns (observation, info)
score = 0

while True:
self.target.render()
state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

action = torch.argmax(self.actor_network(state_tensor)).item()
next_state, reward, terminated, truncated, _ = self.target.step(action)
done = terminated or truncated

score += reward
state = next_state

if done:
scores.append(score)
episodes.append(episode)
plt.plot(episodes, scores, 'b')
plt.savefig("./learning_curves/maxent_test_30000_network.png")
break

if episode % 1 == 0:
print('{} episode score is {:.2f}'.format(episode, score))
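
The critic above doubles as the learned reward: get_reward concatenates the state with the chosen action and passes it through CriticNetwork. A minimal sketch of that call outside the training loop, assuming the MountainCar-v0 observation layout (position, velocity) and an import path matching this PR's src/irlwpython layout:

import torch
from irlwpython.MaxEntropyDeepIRL import CriticNetwork  # import path assumed

state_dim, action_dim = 2, 3                            # MountainCar-v0: 2-D observation, 3 discrete actions
critic = CriticNetwork(state_dim + 1, hidden_size=100)  # input is the concatenated (state, action)

state, action = [-0.5, 0.0], 1                          # hypothetical observation and action
state_action = torch.Tensor(list(state) + [action])
irl_reward = critic(state_action)                       # stands in for the environment reward
print(irl_reward.item())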
10 changes: 8 additions & 2 deletions src/irlwpython/MaxEntropyIRL.py
@@ -64,7 +64,7 @@ def maxent_irl(self, expert, learner, learning_rate):

# Clip theta
for j in range(len(self.theta)):
if self.theta[j] > 0:
if self.theta[j] > 0:  # clip to non-positive ("log") values
self.theta[j] = 0

def update_q_table(self, state, action, reward, next_state):
@@ -101,9 +101,11 @@ def train(self, theta_learning_rate):
state = self.target.env_reset()
score = 0

# Mini-Batches ?
# Mini-Batches:
if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
# calculate density
learner = learner_feature_expectations / episode
# Maximum Entropy IRL step
self.maxent_irl(expert, learner, theta_learning_rate)

# One Step in environment
@@ -115,12 +117,16 @@
# Run one timestep of the environment's dynamics.
next_state, reward, done, _, _ = self.target.env_step(action)

# get pseudo-reward and update q table
irl_reward = self.get_reward(self.n_states, state_idx)
next_state_idx = self.target.idx_to_state(next_state)
self.update_q_table(state_idx, action, irl_reward, next_state_idx)

# State counting for density estimation
learner_feature_expectations += self.get_feature_matrix()[int(state_idx)]

print(reward, irl_reward)

score += reward
state = next_state
if done:
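
The hunk above shows only the clipping and logging around the tabular update; the rest of maxent_irl is collapsed in this diff. For orientation, a minimal sketch of the standard MaxEnt IRL gradient step that this clipping belongs to (function name and signature are illustrative, not the PR's exact code):

import numpy as np

def maxent_irl_step(theta, expert, learner, learning_rate):
    # Gradient of the maximum-entropy objective: expert minus learner feature expectations.
    gradient = expert - learner
    theta += learning_rate * gradient
    # Keep theta non-positive, matching the clipping loop shown in the diff.
    np.clip(theta, a_min=None, a_max=0, out=theta)
    return theta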
Binary file added src/irlwpython/expert_demo/expert_demo.p
Binary file modified src/irlwpython/learning_curves/maxent_30000.png
Binary file modified src/irlwpython/learning_curves/maxent_test_30000.png
19 changes: 15 additions & 4 deletions src/irlwpython/main.py
@@ -9,6 +9,8 @@

#from irlwpython import __version__

import gym

__author__ = "HokageM"
__copyright__ = "HokageM"
__license__ = "MIT"
@@ -74,18 +76,27 @@ def main(args):

gamma = 0.99
q_learning_rate = 0.03
theta_learning_rate = 0.05

# Theta works as Critic
theta_learning_rate = 0.05
theta = -(np.random.uniform(size=(n_states,)))

if args.render:
car = MountainCar(True, one_feature)
else:
car = MountainCar(False, one_feature)

#if args.deep:
# deep = MaxEntropyDeepIRL()
# deep.run()
if args.deep:

# Create MountainCar environment
env = gym.make('MountainCar-v0', render_mode="human")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Run MaxEnt Deep IRL using MountainCar environment
maxent_deep_irl_agent = MaxEntropyDeepIRL(env, state_dim, action_dim)
maxent_deep_irl_agent.train()
maxent_deep_irl_agent.test()

if args.training:
q_table = np.zeros((n_states, n_actions))
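
For reference, a hypothetical sketch of the CLI wiring the new args.deep branch relies on; the actual parser sits outside this diff, and the flag names are only inferred from the attribute names:

import argparse

parser = argparse.ArgumentParser(description="IRLwPython")
parser.add_argument("--deep", action="store_true", help="run MaxEnt Deep IRL on MountainCar-v0")
parser.add_argument("--render", action="store_true", help="render the environment during training")
parser.add_argument("--training", action="store_true", help="run tabular MaxEnt IRL training")
args = parser.parse_args()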
Binary file modified src/irlwpython/results/maxent_30000_table.npy
46 changes: 46 additions & 0 deletions src/irlwpython/utils/utils.py
@@ -0,0 +1,46 @@
# from: https://github.com/reinforcement-learning-kr/lets-do-irl/

import math
import torch
from torch.distributions import Normal


def get_action(mu, std):
action = torch.normal(mu, std)
action = action.data.numpy()
action_list = [0, 1, 2]
return min(action_list, key=lambda x: abs(x - action))


def get_entropy(mu, std):
dist = Normal(mu, std)
entropy = dist.entropy().mean()
return entropy


def log_prob_density(x, mu, std):
log_prob_density = -(x - mu).pow(2) / (2 * std.pow(2)) \
- 0.5 * math.log(2 * math.pi)
return log_prob_density.sum(1, keepdim=True)


def get_reward(discrim, state, action):
print("Input get reward")
print("state", state)
print("action", action)

state = torch.Tensor(state)
action = torch.Tensor(action)
state_action = torch.cat([state, action])

print("HELP")
print("state", state)
print("action", action)
print("state_action", state_action)

with torch.no_grad():
return -math.log(discrim(state_action)[0].item())


def save_checkpoint(state, filename):
torch.save(state, filename)
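
A short usage sketch for the helpers above (outside the diff): a Gaussian policy head with mean mu and standard deviation std, mapped onto MountainCar's three discrete actions. The shapes, values, and import path are assumptions for illustration.

import torch
from irlwpython.utils.utils import get_action, get_entropy, log_prob_density  # import path assumed

mu = torch.tensor([[1.2]])      # policy mean
std = torch.tensor([[0.5]])     # policy standard deviation
sample = torch.tensor([[1.0]])  # action value whose likelihood is scored

print(get_action(mu, std))                 # nearest discrete action in {0, 1, 2}
print(get_entropy(mu, std))                # mean entropy of the Gaussian policy
print(log_prob_density(sample, mu, std))   # log-density of the sample, summed over dims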