In [1]:
"""
An example of training a Deep Q-Network (DQN) agent on the French Tarot game
"""
import os
import time

import tensorflow as tf

import rlcard
from rlcard.agents.dqn_agent import DQNAgent
from rlcard.agents.random_agent import RandomAgent
from rlcard.utils.logger import Logger
from rlcard.utils.utils import set_global_seed, time_difference_good_format

# Build two 'tarot' environments: `env` is the one run with is_training=True
# further down, `eval_env` is a second instance run with is_training=False.
env, eval_env = rlcard.make('tarot'), rlcard.make('tarot')

# Training schedule, all expressed in episodes/games:
#   evaluate_every  - run an evaluation (and save the model) every N episodes
#   save_plot_every - save a learning-curve plot every N episodes
#   evaluate_num    - number of games played per evaluation
#   episode_num     - total number of training episodes
evaluate_every = 100
save_plot_every = 100
evaluate_num = 100
episode_num = 10000

# Run identifier; it is embedded in the experiment and model paths below.
record_number = 9

# Set the number of steps for collecting normalization statistics
# and the initial memory size
memory_init_size = 5000
norm_step = 1000

# The paths for saving the logs and learning curves
root_path = './experiments/tarot_dqn_result_v{}/'.format(record_number)
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Model save path.
# Create the whole directory chain in one call. The previous nested
# `os.path.exists` checks only created the sub-directories when the top-level
# 'rlcard/models' directory was missing, so the leaf directory was never
# created once 'rlcard/models' already existed.
model_dir = 'rlcard/models/pretrained/tarot_v{}'.format(record_number)
os.makedirs(model_dir, exist_ok=True)
model_path = model_dir + '/model'

# Length of the last progress message, used to blank out leftovers when the
# next message is shorter.
_last_progress_len = 0


def _print_progress(message):
    """Overwrite the current console line with ``message``.

    The message is padded with spaces up to the length of the previously
    printed progress message. Without the padding, returning to the start of
    the line and printing a shorter message leaves stale trailing characters
    on screen (the run log showed e.g. "1 secondds").
    """
    global _last_progress_len
    print('\r' + message.ljust(_last_progress_len), end='')
    _last_progress_len = len(message)


# Set a global seed
set_global_seed(0)

with tf.compat.v1.Session() as sess:
    # Set agents
    # NOTE(review): `global_step` is never read again in this script. It is
    # kept because creating it still adds a variable to the graph, presumably
    # so that it is part of the saved checkpoint - confirm before removing.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    # NOTE(review): action_num is hard-coded to 78 rather than taken from
    # env.action_num (see the inline comment) - confirm both agree.
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=78,  # env.action_num,
                     replay_memory_size=20000,
                     replay_memory_init_size=memory_init_size,
                     norm_step=norm_step,
                     state_shape=env.state_shape,
                     mlp_layers=[512, 512])

    random_agent = RandomAgent(action_num=eval_env.action_num)

    sess.run(tf.compat.v1.global_variables_initializer())

    saver = tf.compat.v1.train.Saver()

    # Seat 0 is the DQN agent; every other seat is filled with the random agent.
    # Each environment is sized from its own player count.
    env.set_agents([agent] + [random_agent] * (env.player_num - 1))
    eval_env.set_agents([agent] + [random_agent] * (eval_env.player_num - 1))

    # Count the number of transitions fed to the agent
    step_counter = 0

    # Games played so far (training + evaluation) and the wall-clock start,
    # both only used for the progress display.
    total_game_played = 0
    seconds = time.time()

    # Training only starts once more than this many transitions have been fed.
    warmup_steps = memory_init_size + norm_step

    # Init a Logger to plot the learning curve
    logger = Logger(xlabel='timestep', ylabel='reward', legend='DQN on TAROT', log_path=log_path, csv_path=csv_path)

    for episode in range(episode_num):
        _print_progress('EPISODE {} - Number of game played {} - {}'.format(
            episode, total_game_played,
            time_difference_good_format(seconds, time.time())))

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)
        total_game_played += 1

        # Feed the transitions of player 0 (the DQN agent) into its memory and
        # run one training step per transition once the warm-up is over.
        for ts in trajectories[0]:
            agent.feed(ts)
            step_counter += 1

            if step_counter > warmup_steps:
                agent.train()

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            # Save Model
            saver.save(sess, model_path)
            reward = 0
            for eval_episode in range(evaluate_num):
                _print_progress(
                    'EPISODE {} - Eval {} over {} - Number of game played {} - {}'.format(
                        episode, eval_episode, evaluate_num, total_game_played,
                        time_difference_good_format(seconds, time.time())))
                _, payoffs = eval_env.run(is_training=False)
                total_game_played += 1
                # Only the payoff of seat 0 (the DQN agent) is accumulated.
                reward += payoffs[0]

            average_reward = float(reward) / evaluate_num
            logger.log('\n########## Evaluation - Episode {} ##########'.format(episode))
            logger.log('Timestep: {} Average reward is {}'.format(env.timestep, average_reward))

            # Add point to logger
            logger.add_point(x=env.timestep, y=average_reward)

        # Make plot
        if episode % save_plot_every == 0 and episode > 0:
            logger.make_plot(save_path=figure_path + str(episode) + '.png')

    # Save the final weights: inside the loop the model is only written on
    # evaluation episodes, so the episodes trained after the last evaluation
    # would otherwise be missing from the checkpoint.
    saver.save(sess, model_path)

    # Make the final plot
    logger.make_plot(save_path=figure_path + 'final_' + str(episode) + '.png')


tarot-rule-v1
rlcard.models.tarot_rule_models:TAROTRuleModelV1
tarot-bid-rule-v1
rlcard.models.tarot_bid_rule_models:TAROTBIDRuleModelV1
tarot-dog-rule-v1
rlcard.models.tarot_dog_rule_models:TAROTDOGRuleModelV1
Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


EPISODE 0 - Eval 99 over 100 - Number of game played 100 - 8 seconds
########## Evaluation - Episode 0 ##########
Timestep: 81 Average reward is 0.2
EPISODE 100 - Eval 99 over 100 - Number of game played 300 - 35 seconds
########## Evaluation - Episode 100 ##########
Timestep: 8088 Average reward is 3.38
EPISODE 200 - Eval 99 over 100 - Number of game played 500 - 1 minute and 20 seconds
########## Evaluation - Episode 200 ##########
Timestep: 16136 Average reward is 3.3
EPISODE 300 - Eval 99 over 100 - Number of game played 700 - 2 minutes and 27 seconds
########## Evaluation - Episode 300 ##########
Timestep: 24148 Average reward is 0.01
EPISODE 400 - Eval 99 over 100 - Number of game played 900 - 4 minutes and 1 secondds
########## Evaluation - Episode 400 ##########
Timestep: 32193 Average reward is 2.69
EPISODE 500 - Eval 99 over 100 - Number of game played 1100 - 5 minutes and 53 seconds
########## Evaluation - Episode 500 ##########
Timestep: 40288 Average reward is 2.34
EPISODE



EPISODE 2200 - Eval 99 over 100 - Number of game played 4500 - 1 hour, 22 minutes and 58 seconds
########## Evaluation - Episode 2200 ##########
Timestep: 177459 Average reward is 2.51
EPISODE 2300 - Eval 99 over 100 - Number of game played 4700 - 1 hour, 30 minutes and 14 seconds
########## Evaluation - Episode 2300 ##########
Timestep: 185514 Average reward is 2.69
EPISODE 2400 - Eval 99 over 100 - Number of game played 4900 - 1 hour, 37 minutes and 49 seconds
########## Evaluation - Episode 2400 ##########
Timestep: 193535 Average reward is 2.61
EPISODE 2500 - Eval 99 over 100 - Number of game played 5100 - 1 hour, 45 minutes and 39 seconds
########## Evaluation - Episode 2500 ##########
Timestep: 201574 Average reward is 2.44
EPISODE 2600 - Eval 99 over 100 - Number of game played 5300 - 1 hour, 53 minutes and 48 seconds
########## Evaluation - Episode 2600 ##########
Timestep: 209590 Average reward is 3.18
EPISODE 2700 - Eval 99 over 100 - Number of game played 5500 - 2 hours, 2 m