In [1]:
# Problem 1: Behavior Cloning

In [2]:
"""
Strategy:
1. Collect expert rollouts
2. Create neural network that parameterizes the policy pi_theta (a_t | o_t)
3. Train network on rollout data (o_t = x_train, a_t = y_train)
4. Visualize performance of network
    - render the frames
    - measure similarity between distributions (cross entropy / KL div)

Notes:
- All expert envs have continuous actions
- Actions are continuous so we'll need to output logit scores
- We'll use the MSE loss 

"""

import gym
env = gym.make("Walker2d-v2")
print(env.action_space)
import torch
from sklearn.utils import shuffle


Box(6,)


# Behavioral Cloning

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim

import gym
import pickle
import numpy as np
import itertools
from sklearn.utils import shuffle

# follow this
# https://github.com/KuNyaa/berkeleydeeprlcourse-homework-pytorch-solution/blob/master/hw1/train.py

def print_summary(title,policy,env_name):
    env = gym.make(env_name)
    print('='*70)
    print(title)
    print('Action space:',env.action_space)
    print('Observation space:',env.observation_space)
    print('Environment:',env_name)
    print(policy)
    print('='*70)

def load_data(env_name,verbose=False):
    with open('./expert_data/'+env_name+'.pkl', 'rb') as f:
        data = pickle.load(f)

    X_train = data['observations']
    y_train = data['actions'].reshape(-1,act_dim)
    N = X_train.shape[0]
    val_ratio = 0.1
    rand_ids = np.random.choice(range(N),N,replace=False)
    M = int(val_ratio * N)
    
    val_ids = rand_ids[:M]
    #test_ids = rand_ids[M:2*M]
    train_ids = rand_ids[M:]
    
    #X_train -= X_train.mean(0).reshape(-1,1)
    #X_train /= X_train.std(0).reshape(-1,1)
    
    X_val = torch.tensor(X_train[val_ids]).float()
    y_val = torch.tensor(y_train[val_ids]).float()

    X_train = torch.tensor(X_train[train_ids]).float()
    y_train = torch.tensor(y_train[train_ids]).float()
    
    if verbose:
        print('='*70)
        print('LOADED EXPERT DATA')
        print('Data points',N)
        print('X_train',X_train.shape)
        print('y_train',y_train.shape)
        print('X_val',X_val.shape)
        print('X_val',y_val.shape)
        #print('X_test',X_test.shape)
        #print('y_test',y_test.shape)
        print('='*70)
    
    return X_train,y_train,X_val,y_val

class BCPolicy(nn.Module):
    
    def __init__(self,obs_dim,h_dim,act_dim):
        super(BCPolicy,self).__init__()
        
        
        self.layers = nn.Sequential(
            nn.Linear(obs_dim, h_dim),
            nn.ReLU(),
            nn.Linear(h_dim, h_dim),
            nn.ReLU(),
            nn.Linear(h_dim, h_dim),
            nn.ReLU(),
            nn.Linear(h_dim, act_dim)
        )
        
    def forward(self,X):
        return self.layers(X)

def MSE_loss(y_pred,y):
    return torch.mean((y_pred-y)**2) 


title = 'Behavioral Cloning Neural Network'
env_name = "Ant-v2"
env = gym.make(env_name)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

# hyperparameters
h_dim = 128
learning_rate = 1e-3
epochs = 20
batch_size = 2000
episodes = 10
max_steps = env._max_episode_steps

policy = BCPolicy(obs_dim,h_dim,act_dim)

optimizer = optim.Adam(policy.parameters(),lr=learning_rate)

print_summary(title,policy,env_name)

X_train,y_train,X_val,y_val = load_data(env_name,True)

y_pred_val = policy(X_val)
score = MSE_loss(y_pred_val,y_val)

#print('Validation set MSE')
#print(score)
#print('='*70)

N = X_train.shape[0]

mse_loss = nn.MSELoss()
# train and measure validation loss

for epoch in range(epochs):
    
    # sample minibatch
    rand_ids = np.random.choice(range(N),batch_size)
    X_batch = X_train#[rand_ids]
    y_batch = y_train#[rand_ids]
    y_pred = policy(X_batch)

    y_val_pred = policy(X_val)
    loss = mse_loss(y_pred, y_batch)
    #loss_val = mse_loss(y_val_pred, y_val).detach().numpy()
    
    print('Epoch',epoch,'Training loss',loss.detach().numpy())#,'Validation loss',loss_val)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
# test the policy

for episode in range(episodes):
    returns = 0.0
    s = env.reset()
    with torch.no_grad():
        for step in itertools.count():
            s = torch.tensor(s).float()
            a = policy(s).detach().numpy()
            s,r,d,_ = env.step(a)
            returns+=r
            if d:
                break
    print(returns)



Behavioral Cloning Neural Network
Action space: Box(8,)
Observation space: Box(111,)
Environment: Ant-v2
BCPolicy(
  (layers): Sequential(
    (0): Linear(in_features=111, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=8, bias=True)
  )
)
LOADED EXPERT DATA
Data points 196845
X_train torch.Size([177161, 111])
y_train torch.Size([177161, 8])
X_val torch.Size([19684, 111])
X_val torch.Size([19684, 8])
Epoch 0 Training loss 0.10178931
Epoch 1 Training loss 0.092089154
Epoch 2 Training loss 0.08420419
Epoch 3 Training loss 0.077312835
Epoch 4 Training loss 0.071023256
Epoch 5 Training loss 0.06513419


KeyboardInterrupt: 

In [34]:
import pickle, tensorflow as tf, tf_util, numpy as np

def load_policy(filename):
    with open(filename, 'rb') as f:
        data = pickle.loads(f.read())

    # assert len(data.keys()) == 2
    nonlin_type = data['nonlin_type']
    policy_type = [k for k in data.keys() if k != 'nonlin_type'][0]

    assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type)
    policy_params = data[policy_type]

    assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'}

    # Keep track of input and output dims (i.e. observation and action dims) for the user

    def build_policy(obs_bo):
        def read_layer(l):
            assert list(l.keys()) == ['AffineLayer']
            assert sorted(l['AffineLayer'].keys()) == ['W', 'b']
            return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32)

        def apply_nonlin(x):
            if nonlin_type == 'lrelu':
                return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233
            elif nonlin_type == 'tanh':
                return tf.tanh(x)
            else:
                raise NotImplementedError(nonlin_type)

        # Build the policy. First, observation normalization.
        assert list(policy_params['obsnorm'].keys()) == ['Standardizer']
        obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D']
        obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D']
        obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
        print('obs', obsnorm_mean.shape, obsnorm_stdev.shape)
        normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation

        curr_activations_bd = normedobs_bo

        # Hidden layers next
        assert list(policy_params['hidden'].keys()) == ['FeedforwardNet']
        layer_params = policy_params['hidden']['FeedforwardNet']
        for layer_name in sorted(layer_params.keys()):
            l = layer_params[layer_name]
            W, b = read_layer(l)
            curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b)

        # Output layer
        W, b = read_layer(policy_params['out'])
        output_bo = tf.matmul(curr_activations_bd, W) + b
        return output_bo

    obs_bo = tf.placeholder(tf.float32, [None, None])
    a_ba = build_policy(obs_bo)
    policy_fn = tf_util.function([obs_bo], a_ba)
    return policy_fn

In [38]:
import tensorflow as tf
import gym

#import load_policy
import tf_util
print('loading and building expert policy')
policy_fn = load_policy('experts/Ant-v2.pkl')
print('loaded and built')

# test the policy
with tf.Session():
    tf_util.initialize()

    env = gym.make('Ant-v2')
    #max_steps = args.max_timesteps or env.spec.timestep_limit
    max_steps = env._max_episode_steps

    returns = []
    observations = []
    actions = []
    for i in range(10):
        print('iter', i)
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            print(obs.shape)
            print(obs[None,:].shape)
            print(model)
            assert False
            action = policy_fn(obs[None,:])
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1

            #if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
            if steps >= max_steps:
                break
        returns.append(totalr)
        print(totalr)

    print('returns', returns)
    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))


loading and building expert policy
obs (1, 111) (1, 111)
loaded and built
iter 0
(111,)
(1, 111)


AssertionError: 

In [43]:
import pickle, tensorflow as tf, tf_util, numpy as np

def load_policy(filename):
    with open(filename, 'rb') as f:
        data = pickle.loads(f.read())

    # assert len(data.keys()) == 2
    nonlin_type = data['nonlin_type']
    policy_type = [k for k in data.keys() if k != 'nonlin_type'][0]

    assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type)
    policy_params = data[policy_type]

    assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'}

    # Keep track of input and output dims (i.e. observation and action dims) for the user

    def build_policy(obs_bo):
        def read_layer(l):
            assert list(l.keys()) == ['AffineLayer']
            assert sorted(l['AffineLayer'].keys()) == ['W', 'b']
            return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32)

        def apply_nonlin(x):
            if nonlin_type == 'lrelu':
                return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233
            elif nonlin_type == 'tanh':
                return tf.tanh(x)
            else:
                raise NotImplementedError(nonlin_type)

        # Build the policy. First, observation normalization.
        assert list(policy_params['obsnorm'].keys()) == ['Standardizer']
        obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D']
        obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D']
        obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
        print('obs', obsnorm_mean.shape, obsnorm_stdev.shape)
        normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation

        curr_activations_bd = normedobs_bo

        # Hidden layers next
        assert list(policy_params['hidden'].keys()) == ['FeedforwardNet']
        layer_params = policy_params['hidden']['FeedforwardNet']
        for layer_name in sorted(layer_params.keys()):
            l = layer_params[layer_name]
            W, b = read_layer(l)
            curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b)

        # Output layer
        W, b = read_layer(policy_params['out'])
        output_bo = tf.matmul(curr_activations_bd, W) + b
        return output_bo

    obs_bo = tf.placeholder(tf.float32, [None, None])
    a_ba = build_policy(obs_bo)
    policy_fn = tf_util.function([obs_bo], a_ba)
    return policy_fn

In [58]:
import os
import torch
import numpy as np
from torch import optim, nn
from torch.utils.data import TensorDataset, ConcatDataset, DataLoader

def agent_wapper(config, agent):
    def fn(obs):
        with torch.no_grad():
            obs = obs.astype(np.float32)
            assert len(obs.shape) == 2
            obs = torch.from_numpy(obs).to(config.device)
            action = agent(obs)
        return action.cpu().numpy()
    return fn

def fit_dataset(config, agent, dataset, n_epochs):
    optimizer = optim.Adam(agent.parameters(), lr=config.lr, weight_decay=config.L2)
    loss_fn = nn.MSELoss()
    dataloader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True)
    
    step = 0
    best_reward = None
    loss_his = []
    
    for k in range(n_epochs):
        for batch in dataloader:
            obs, gold_actions = batch
            pred_actions = agent(obs)
            loss = loss_fn(pred_actions, gold_actions)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_his.append(loss.item())

            if step % config.eval_steps == 0:
                avrg_mean, avrg_std = Eval(config, agent_wapper(config, agent))
                avrg_loss = np.mean(loss_his)
                loss_his = []
                print('[epoch {}  step {}] loss: {:.4f}  r_mean: {:.2f}  r_std: {:.2f}'.format(
                    k + 1, step, avrg_loss, avrg_mean, avrg_std))

                avrg_reward = avrg_mean - avrg_std
                if best_reward is None or best_reward < avrg_reward:
                    best_reward = avrg_reward
                    save_model(config, agent, config.model_save_path)
                
            step += 1
    
    load_model(config, agent, config.model_save_path)

def BehavioralCloning(config, agent, expert):

    # get expert demonstration
    expert_obs, expert_actions, *_ = run_agent(config, expert, config.n_expert_rollouts)
    expert_obs = torch.from_numpy(expert_obs).to(config.device)
    expert_actions = torch.from_numpy(expert_actions).to(config.device)
    dataset = TensorDataset(expert_obs, expert_actions)

    # training agent
    fit_dataset(config, agent, dataset, config.epochs)

    return agent_wapper(config, agent)

def DAgger(config, agent, expert):
    # get expert demonstration
    expert_obs, expert_actions, *_ = run_agent(config, expert, config.n_expert_rollouts)
    expert_obs = torch.from_numpy(expert_obs).to(config.device)
    expert_actions = torch.from_numpy(expert_actions).to(config.device)
    dataset = TensorDataset(expert_obs, expert_actions)

    for k in range(config.n_dagger_iter):
        # training agent
        fit_dataset(config, agent, dataset, config.epochs)
        
        # run agent to get new on-policy observations
        new_obs, *_ = run_agent(config, agent_wapper(config, agent), config.n_dagger_rollouts)
        expert_actions = expert(new_obs)
        
        new_obs = torch.from_numpy(new_obs).to(config.device)
        expert_actions = torch.from_numpy(expert_actions).to(config.device)
        new_data = TensorDataset(new_obs, expert_actions)
        
        # add new data to dataset
        dataset = ConcatDataset([dataset, new_data])
            

        avrg_mean, avrg_std = Eval(config, agent_wapper(config, agent))
        print('[DAgger iter {}] r_mean: {:.2f}  r_std: {:.2f}'.format(k + 1, avrg_mean, avrg_std))

        
    return agent_wapper(config, agent)
    
def run_agent(config, agent, num_rollouts):
    env = config.env
    max_steps = env.spec.timestep_limit
    o_dim = env.observation_space.shape[0]

    returns = []
    observations = []
    actions = []
    for _ in range(num_rollouts):
        obs = env.reset()
        done = False
        totalr = 0
        steps = 0
        while not done:
            print(obs.reshape(-1,o_dim).shape)
            print(agent)
            assert False
            action = agent(obs.reshape(-1,o_dim))
            action = action.reshape(-1)
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            if steps >= max_steps:
                break
        returns.append(totalr)

    avrg_mean, avrg_std = np.mean(returns), np.std(returns)
    observations = np.array(observations).astype(np.float32)
    actions = np.array(actions).astype(np.float32)

    return observations, actions, avrg_mean, avrg_std

def Eval(config, agent):
    *_, avrg_mean, avrg_std = run_agent(config, agent, config.n_eval_rollouts)

    return avrg_mean, avrg_std


def save_model(config, model, PATH):
    if not os.path.exists(PATH):
        os.makedirs(PATH)
    PATH = PATH + config.envname + '-' + 'parameters.tar'
    torch.save(model.state_dict(), PATH)
    print('model saved.')

def load_model(config, model, PATH):
    PATH = PATH + config.envname + '-' + 'parameters.tar'
    model.load_state_dict(torch.load(PATH))
    print('model loaded.')

In [59]:
import gym
import torch
from torch import nn
#from load_policy import load_policy

class Config():
    seed = 3
    envname = 'Humanoid-v2'
    env = gym.make(envname)
    method = 'BC' # BC: Behavioral Cloning   DA: DAgger
    device = torch.device('cpu')
    expert_path = './experts/'
    model_save_path = './models/'
    n_expert_rollouts = 30 # number of rollouts from expert
    n_dagger_rollouts = 10 # number of new rollouts from learned model for a DAgger iteration
    n_dagger_iter = 10 # number of DAgger iterations
    n_eval_rollouts = 10 # number of rollouts for evaluating a policy
    L2 = 0.00001
    lr = 0.0001
    epochs = 20
    batch_size = 64

    eval_steps = 500

    

class Agent(nn.Module):
    def __init__(self, in_dim, out_dim):
        super(Agent, self).__init__()

        self.mlp = nn.Sequential(
            nn.Linear(in_dim, 64),
            nn.ReLU(True),
            nn.Linear(64, 64),
            nn.ReLU(True),
            nn.Linear(64, out_dim),
        )

    def forward(self, obs):
        return self.mlp(obs)


def main():
    config = Config()
    print('*' * 20, config.envname, config.method, '*' * 20)
    env = config.env
    if config.seed:
        env.seed(config.seed)
        torch.manual_seed(config.seed)
    agent = Agent(env.observation_space.shape[0], env.action_space.shape[0]).to(config.device)
    expert = load_policy(config.expert_path + config.envname + '.pkl')
    method = config.method

    if method == 'BC':
        agent = BehavioralCloning(config, agent, expert)
    elif method == 'DA':
        agent = DAgger(config, agent, expert)
    else:
        NotImplementedError(method)

    
    avrg_mean, avrg_std = Eval(config, expert)
    print('[expert] avrg_mean:{:.2f}  avrg_std:{:.2f}'.format(avrg_mean, avrg_std))
        
    avrg_mean, avrg_std = Eval(config, agent)
    print('[agent] avrg_mean:{:.2f}  avrg_std:{:.2f}'.format(avrg_mean, avrg_std))

if __name__ == '__main__':
    main()

******************** Humanoid-v2 BC ********************
obs (1, 376) (1, 376)
(1, 376)


AssertionError: 