In [None]:
import numpy as np
import matplotlib.pyplot as plt
import random
import scipy as sc
import time
from functools import reduce

import tensorflow as tf
from tensorflow import keras

import rl_env

In [None]:
def create_environment(game_type='Hanabi-Full', num_players=2):
  """Creates the Hanabi environment.

  Args:
    game_type: Type of game to play. Currently the following are supported:
      Hanabi-Full: Regular game.
      Hanabi-Small: The small version of Hanabi, with 2 cards and 2 colours.
    num_players: Int, number of players to play this game.

  Returns:
    A Hanabi environment.
  """
  return rl_env.make(
      environment_name=game_type, num_players=num_players, pyhanabi_path=None)

In [69]:
def create_actor_lstm(num_actions, state_shape, num_steps, num_trajectories, num_layers=2, num_units=512):
    cells = [tf.keras.layers.LSTMCell(num_units), tf.keras.layers.LSTMCell(num_units)]
    
    x = tf.placeholder(tf.float32, shape=(num_trajectories, num_steps, state_shape), name='x')
    actions = tf.placeholder(tf.float32, shape=[None,num_actions], name='actions')
    outputs = keras.layers.RNN(cells, return_sequences=True)(x)
    
    logits = tf.layers.dense(outputs, units=num_actions, activation=None)
    y = tf.nn.softmax(logits)
    
    negative_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(labels=actions, logits=logits)
    
    advantage = tf.placeholder(tf.float32, shape=[None,], name='advantage')
    
    loss = tf.reduce_mean(negative_log_prob*advantage)#tf.reduce_mean(tf.multiply(negative_log_prob, advantage))
    optimizer = tf.train.AdamOptimizer(0.1)
    train = optimizer.minimize(loss)
    
    return y, x, train, loss, advantage, actions, negative_log_prob, outputs, logits

def create_critic_model(state_shape, num_units=512):
    model = keras.Sequential([
        keras.layers.Dense(num_units, input_dim=state_shape, activation=tf.nn.relu),
        keras.layers.Dense(num_units, activation=tf.nn.relu),
        keras.layers.Dense(1)
    ])
    model.compile(loss='mean_squared_error',
                optimizer='adam',
                metrics=['mean_squared_error'])
    return model

In [73]:
y, x, train, loss, advantage, actions, negative_log_prob, outputs, logits = create_actor_lstm(3, 7, 5, 10)
outputs, logits, negative_log_prob, y

(<tf.Tensor 'rnn_8/transpose_1:0' shape=(10, 5, 512) dtype=float32>,
 <tf.Tensor 'dense_7/BiasAdd:0' shape=(10, 5, 3) dtype=float32>,
 <tf.Tensor 'softmax_cross_entropy_with_logits_6/Reshape_2:0' shape=(10, 5) dtype=float32>,
 <tf.Tensor 'Softmax_6:0' shape=(10, 5, 3) dtype=float32>)

In [76]:
init = tf.global_variables_initializer()

sess = tf.Session()
sess.run(init)

In [82]:
xx = np.random.choice([0,1,2], (1,5,7))
#xx = xx.reshape((10,5,-1,7))
xx.shape

(1, 5, 7)

In [83]:
sess.run(y, {x: xx})

ValueError: Cannot feed value of shape (1, 5, 7) for Tensor 'x_8:0', which has shape '(10, 5, 7)'

In [None]:
def get_action(sess, y, x, state, legal_moves):
    policy = sess.run(y, {x: [state]}).squeeze()
    # somehow only consider the legal actions
    policy_legal = np.full(policy.shape, -np.inf)
    policy_legal[legal_moves] = policy[legal_moves]
    policy_legal = sc.special.softmax(policy_legal)
    
    action = np.random.choice(policy_legal.shape[0], p=policy_legal)
    logprob = np.log(policy_legal[action])
    
    return action, logprob

def get_value_estimate(critic, state):
    return critic.predict(np.reshape(state, (1,-1))).squeeze()
    

In [None]:
def calculateRtsAndAts(rewards, baselines, gamma=0.95):
    Rts = []
    Ats = []
    
    for traj in range(len(rewards)):
        Rt = 0
        T = len(rewards[traj])
        Rts.append(np.zeros(T))
        Ats.append(np.zeros(T))
        for t in reversed(range(T)):
            Rt += rewards[traj][t]
            Rts[traj][t] = Rt
            At = Rt - baselines[traj][t]
            Ats[traj][t] = At
            
            Rt *= gamma
            
    return Rts, Ats

In [None]:
#actor = create_actor_model(env.num_moves(), env.vectorized_observation_shape()[0])
#critic = create_critic_model(env.vectorized_observation_shape()[0])
#rnn = create_actor_lstm(0, 658, 10, 50)
#actor.summary()
#critic.summary()

In [None]:
num_iterations = 1
num_trajectories = 1000
t0 = time.time()


for it in range(num_iterations):
    rewards = np.zeros((num_trajectories, 110))
    ep_lens = np.zeros(num_trajectories).astype(int)


    # collect trajectories
    for traj in range(num_trajectories):
        # reset environment
        observations = env.reset()
        reward = 0
        is_done = False

        # collect one trajectory
        for st in range(110):
            # extract legal moves and state from observations
            moves = observations['player_observations'][st % 2]['legal_moves_as_int']
            state = observations['player_observations'][st % 2]['vectorized']

            # get next action
            action, _ = get_action_NEW(sess, y, x, state, moves)

            # do next step
            observations, reward, is_done, _ = env.step(action)

            # store reward after executing action
            rewards[traj, st] = reward


            if (is_done or st == 110-1):
                ep_lens[traj] = st
                #print("Done", ep, st)
                #print(observations['player_observations'][0]['pyhanabi'])
                #print('reward', reward)
                #print(' --- ')
                break

    
t1 = time.time()
print(rewards.shape, ep_lens.shape, t1-t0)

In [None]:
np.min(rewards, axis=1).mean()

In [None]:
num_iterations = 1
num_trajectories = 50
num_steps = 5
t0 = time.time()

# create hanabi environment
env = create_environment()
# init simple keras critic 
critic = create_critic_model(env.vectorized_observation_shape()[0])
# init actor neural net (a custom tf graph)
y, x, train, loss, advantage, actions = create_actor_lstm(env.num_moves(), env.vectorized_observation_shape()[0], 
                                                          num_steps, num_trajectories)
init = tf.global_variables_initializer()

sess = tf.Session()
sess.run(init)

losses = []
for it in range(num_iterations):
    if(it % 50 == 0):
        tt = time.time()
        print('iteration #', it, ', time passed: ', tt-t0)
    
    actions_ = np.zeros((num_trajectories, 110, env.num_moves())) 
    states = np.zeros((num_trajectories, 110, env.vectorized_observation_shape()[0]))
    rewards = np.zeros((num_trajectories, 110))
    baselines = np.zeros((num_trajectories, 110))
    logprobs = np.zeros((num_trajectories, 110))
    ep_lens = np.zeros(num_trajectories)


    # collect trajectories
    for traj in range(num_trajectories):
        # reset environment
        observations = env.reset()
        reward = 0
        is_done = False
                       
        # collect one trajectory
        for st in range(110):
            # extract legal moves and state from observations
            moves = observations['player_observations'][st % 2]['legal_moves_as_int']
            state = observations['player_observations'][st % 2]['vectorized']

            # get next action
            action, logprob = get_action(sess, y, x, state, moves)

            # store variables
            action_[traj, step, action] = 1
            states[traj, step] = state)
            logprobs[traj, step] = logprob

            # get baseline(value estimate)
            baselines[traj, step] = get_value_estimate(critic, state)

            # do next step
            observations, reward, is_done, _ = env.step(action)

            # store reward after executing action
            rewards[traj, step] = reward


            if (is_done or st == 110-1):
                ep_lens[traj] = st
                #print("Done", ep, st)
                #print(observations['player_observations'][0]['pyhanabi'])
                #print('reward', reward)
                #print(' --- ')
                break

    # collected a bunch of trajectories, now calculate discounted rewards (Rts)
    Rts, Ats = calculateRtsAndAts(rewards, baselines)
    # 'flatten' the lists
    #x_train = np.array(reduce(lambda x,y: np.vstack((x,y)), states))
    #adv = np.array(reduce(lambda x,y: np.hstack((x,y)), Ats))
    #adv -= adv.mean()
    #adv /= adv.std() + 10**-10
    rew = np.array(reduce(lambda x,y: np.hstack((x,y)), Rts))
    #acts = np.array(reduce(lambda x,y: np.vstack((x,y)), actions_))
    
    # train critic and actor
    
    #critic.train_on_batch(x_train, rew)
    rew -= rew.mean()
    rew /= rew.std() + 10**-10
    
    max_ep_len = np.max(ep_lens)
    num_batches = np.ceil(max_ep_len / num_steps).astype(int)
    for b in range(num_batches):
        start = b * num_steps
        end = start + num_steps
        
        x_train = states[:, start:end]
        rew = Rts[:, start:end]
        acts = actions[:, start:end]
        
    
    
    _, loss_t = sess.run((train, loss), {x: x, advantage: rew, actions: acts})
    losses.append(loss_t)
    
    
t1 = time.time()
print(len(rewards), len(ep_lens), len(losses), t1-t0)

In [None]:
critic.predict(np.reshape(state, (1,-1))).squeeze()

In [None]:
observations = env.reset()
state = observations['player_observations'][0]['vectorized']
critic.predict(np.reshape(state, (1,-1))).squeeze()

In [None]:
plt.figure(0, (18,10))
plt.plot(list(map(lambda x: -x,losses)))
plt.show()

In [None]:
plt.figure(0, (18,10))
#plt.ylim((-100,60000))
plt.plot(list(map(lambda x: -x,losses)))
plt.show()

In [62]:
b =
start = b * 3
end = start + 3
a = np.arange(11)[start:end]
if (len(a) < 3):
    a = np.concatenate((a, np.zeros(3-len(a))))
a

array([0., 0., 0.])