In [1]:
%load_ext autoreload

In [2]:
from __future__ import division

import numpy as np
from envs import MarsExplorerEnv
import matplotlib.pyplot as plt
import time
import matplotlib.cm as cm
import matplotlib.colors as colors
import os
import seaborn as sns
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
from matplotlib._png import read_png
import pickle as pkl
%matplotlib inline

import gym
from gym import spaces
# from gym.envs.classic_control import rendering
import tensorflow as tf
from scipy.special import logsumexp
from copy import deepcopy as copy
import random
np.set_printoptions(precision=8, suppress=True,threshold=np.nan)


import types
import uuid
import math
import tensorflow as tf
from scipy.misc import logsumexp
# from baselines import deepq
# import baselines.common.tf_util as U

from utils.min_norm_solvers_numpy import MinNormSolver, gradient_normalizers


from envs.environment_visualization_utils import plot_values, plot_mars, plot_reward_map, plot_texture_map, plot_tile_map
from envs.environment_setup_utils import get_mdp
from utils.tf_utils import save_tf_vars, load_tf_vars, os_setup
from utils.soft_q_learning import tabsoftq_learn_Qs, tabsoftq_gen_pol, generate_demonstrations, vectorize_rollouts
from utils.data_utils import initialize_scopes, load_scopes
from utils.learning_utils import generate_constraints, nn_vectorize_rollouts, get_rollout_indexes, sample_batch
from envs.environment_utils import featurize_states
from utils.demos_utils import get_demos
from utils.experiment_utils import current_milli_time
from utils.models import InverseDynamicsLearner


In [3]:
%autoreload 2

# Environment Setup

In [4]:
os_setup()
data_dir = os.path.join('data', '1.1')
out_dir = os.path.join("logs", "models", str(current_milli_time()))
mdp = get_mdp(0)

In [5]:
#visualization demos

#plot_reward_map(mdp)
#plot_texture_map(mdp)
#plot_tile_map(mdp)

# Soft-Q Learning for Demonstrations

Sanity check (env and softq)

In [6]:
#temp_Q = tabsoftq_learn_Qs(mdp, gamma=0.95)
#plot_mars(mdp, tabsoftq_gen_pol(temp_Q*50), Qs=temp_Q)


# Model Set Up

In [7]:
gamma = 0.99
alpha = 1e-4
beta1 = 0.9
beta2 = 0.999999
sq_td_err_penalty = 1
trans_penalty = 1
t_err_penalty = 1e0
q_err_penalty = 1e0
constraint_batch_size = 512

q_n_layers = 2
q_layer_size = 2048
q_activation = tf.nn.tanh
q_output_activation = None

dyn_n_layers = 1
dyn_layer_size = 256
dyn_activation = tf.nn.relu
dyn_output_activation = None


# Boltz-beta determines the "rationality" of the agent being modeled.
# Setting it to higher values corresponds to "pure rationality"
boltz_beta = 50

batch_size=512

In [8]:
###### UNCOMMENT WHEN FIRST RUNNING THIS NOTEBOOK #######
# initialize_scopes(data_dir)
q_scope, dyn_scope = load_scopes(data_dir)
sess = tf.Session()

mlp_params = {'q_n_layers':q_n_layers,
                  'q_layer_size':q_layer_size,
                  'q_activation': q_activation,
                  'q_output_activation':q_output_activation,
                  'dyn_n_layers':dyn_n_layers,
                  'dyn_layer_size':dyn_layer_size,
                  'dyn_activation':dyn_activation,
                  'dyn_output_activation':dyn_output_activation}

model = InverseDynamicsLearner(mdp, sess, mlp_params=mlp_params, boltz_beta=boltz_beta, gamma=gamma)

# IDL Experiments

## Demo setup

In [9]:
#DEMO Config
gamma_demo = 0.99
n_demos = 200
demo_time_steps = 40
temp_boltz_beta = 50

In [10]:
constraints, rollouts, train_idxes, val_demo_batch, true_qs, states, adt_samples = get_demos(
    mdp, gamma, temp_boltz_beta, n_demos, demo_time_steps)

## Weighted Optimization

In [None]:
# Coordinate Config
batch_size = 200
n_training_iters = 5000
horizon = 1000
# Config made up of ['nall', 'ntll', 'tde', 'tde_sg_q', 'tde_sg_t']
losses = [0,1,2]
weights = [1.0,1.0,0.2]

In [None]:
regime_params = {"horizon": horizon,
                 'slope_threshold':slope_threshold,
                 'switch_frequency': switch_frequency,
                 'initial_update': initial_update,
                 'update_progression':update_progression}

## Coordinate Descent

In [11]:
# Coordinate Config
batch_size = 200
n_training_iters = 5000
horizon = 1000
slope_threshold = 1e-4
switch_frequency = 500
# Config made up of ['nall', 'ntll', 'tde', 'tde_sg_q', 'tde_sg_t']
initial_update = [1]
update_progression = [[0,3],[1,4]]

In [12]:
regime_params = {"horizon": horizon,
                 'slope_threshold':slope_threshold,
                 'switch_frequency': switch_frequency,
                 'initial_update': initial_update,
                 'update_progression':update_progression}

In [13]:
model.initialize_training_regime("coordinate", regime_params=regime_params)

In [15]:
model.train(n_training_iters, rollouts, train_idxes, batch_size, constraints, val_demo_batch, out_dir, states, adt_samples)

[('nall', 4.2442017), ('ntll', 1.4745983), ('tde', 4.1750755)]
[('nall', 4.2442017), ('ntll', 0.6769766), ('tde', 4.17442)]


('logs/models/1556232426808/q_fn', 'logs/models/1556232426808/dyn_fn')

## Frank Wolfe

In [18]:
batch_size = 200
n_training_iters = 1000
loss_configurations = [[0,3],[1,4]]

In [19]:
regime_params = {'loss_configurations':loss_configurations}
model.initialize_training_regime("MGDA", regime_params=regime_params)

## Training

In [20]:
model.train(n_training_iters, rollouts, train_idxes, batch_size, constraints, val_demo_batch, out_dir)

[('nall', 16.04462), ('ntll', 1.8168842), ('tde', 5.465369)]


KeyboardInterrupt: 

## Visualization

In [16]:
from utils.metric_utils import get_all_dynamics_distances

In [None]:
out_dir = "experiments/logs/models/1556305513765/tab"



## Testing Training Results

In [16]:
# Sample full dynamics model
adt_probs = sess.run([adt_pred_dir], feed_dict={demo_tile_t_ph:adt_samples[:,0][np.newaxis].T,
                                                demo_act_t_ph:adt_samples[:,1][np.newaxis].T})[0]

NameError: name 'adt_pred_dir' is not defined

In [17]:
# Print learned dynamics
softmax(adt_probs)

NameError: name 'softmax' is not defined

In [18]:
# Sample full q-fn model
q_vals = sess.run([constraint_q_ts], feed_dict={constraint_obs_t_feats_ph:states})[0]
# q_vals = sess.run([demo_q_t], feed_dict={demo_obs_t_feats_ph:states})[0]

NameError: name 'constraint_q_ts' is not defined

In [19]:
# Plot learned q-values and true q-values
plot_values(mdp, q_vals)
plot_values(mdp, Qs)

NameError: name 'q_vals' is not defined

In [20]:
# More comprehensive debugging stuff

test_constraint_adt_pred_probs =  tf.placeholder(tf.float32, [None, n_dirs], name="tpp")
test_constraint_q_t_ph = tf.placeholder(tf.float32, [None], name="tcq")
test_constraint_q_tp1_ph = tf.placeholder(tf.float32, [None, n_dirs, n_act_dim], name="tcqp1")
test_constraint_v_tp1 = tf.reduce_logsumexp(test_constraint_q_tp1_ph, axis=2) #- np.log(5) 
test_adt_V = tf.multiply(test_constraint_v_tp1, test_constraint_adt_pred_probs)
test_adt_target = constraint_rew_t_ph + gamma * tf.reduce_sum(test_adt_V, axis=1)
indiv_errs = test_constraint_q_t_ph - test_adt_target
test_adt_td_err = tf.reduce_mean((test_constraint_q_t_ph - test_adt_target)**2)

NameError: name 'constraint_rew_t_ph' is not defined

In [21]:
# Q-fn Model Storage
q_net_path = os.path.join(data_dir, 'adt_true_q.tf')
save_tf_vars(sess, q_scope, q_net_path)

In [72]:
# Dynamics Model Storage
invadt_path = os.path.join(data_dir, 'adt_fw_t.tf')
save_tf_vars(sess, invadt_scope, invadt_path)

In [73]:
# Training data Storage
pkl.dump(mode_logs, open('adt_fw.pkl', 'wb'))