In [1]:
%cd ..

/home/jovyan/personal_project/RL-tarot


In [2]:
"""
An example of learning a Deep-Q Agent on French Tarot Game
"""
import os
import time

import tensorflow as tf

import rlcard
from rlcard.agents.random_agent import RandomAgent
from rlcard.agents.dqn_agent import DQNAgent
from rlcard.utils.logger import Logger
from rlcard.utils.utils import set_global_seed, time_difference_good_format

# Identifier of this experiment run; it is embedded in every output path below.
record_number = 14

# Two separate Tarot environments: `env` is played for training,
# `eval_env` is played for the periodic evaluations.
env, eval_env = rlcard.make('tarot'), rlcard.make('tarot')

# Schedule, expressed in training episodes.
episode_num = 200000         # total number of training episodes
evaluate_every = 5000        # evaluate / checkpoint / plot every N episodes
evolve_model_every = 20000   # move on to the next self-play index every N episodes
evaluate_num = 1000          # number of games played per evaluation

# Index of the current self-play stage (used in the checkpoint folder name),
# and the number of evaluation rounds that fit in the whole run.
self_play = 1
total_self_play_eval = episode_num // evaluate_every

# Number of steps used to collect normalization statistics,
# and initial size of the replay memory.
norm_step = 10000
memory_init_size = 50000

# Where the logs, CSV files and learning-curve figures are written.
root_path = './experiments/tarot_dqn_self_played_v{}/'.format(record_number)

# Results of the evaluation against random players.
log_path_random = root_path + 'log_random.txt'
csv_path_random = root_path + 'performance_random.csv'
figure_path_random = root_path + 'figures_random/'

# Results of the evaluation against the self-play opponent.
log_path_opponent = root_path + 'log_opponent.txt'
csv_path_opponent = root_path + 'performance_opponent.csv'
figure_path_opponent = root_path + 'figures_opponent/'

# Model save path
# `exist_ok=True` makes directory creation idempotent and avoids the
# check-then-create race of `os.path.exists` + `os.makedirs`; missing parent
# directories ('rlcard/models') are created by `os.makedirs` itself.
os.makedirs('rlcard/models/pretrained', exist_ok=True)

# Prepare one checkpoint folder per possible self-play index up front, so a
# save in the middle of the run never has to create a directory.
for self_play_init in range(1, total_self_play_eval + 1):
    model_folder_path = 'rlcard/models/pretrained/self_played_{}/tarot_v{}'.format(
        str(record_number),
        str(record_number * 10000 + self_play_init))
    os.makedirs(model_folder_path, exist_ok=True)

# Checkpoint prefix for the current self-play index.
model_path = 'rlcard/models/pretrained/self_played_{}/tarot_v{}/model'.format(
    str(record_number),
    str(record_number * 10000 + self_play))

# Set a global seed
set_global_seed(0)

# Baseline opponent, only ever registered on the evaluation environment.
random_agent = RandomAgent(action_num=eval_env.action_num)

with tf.compat.v1.Session() as sess:
    # Set agents
    # `global_step` is never read in this script; it is left in place so the
    # TensorFlow graph is built exactly as before.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    # NOTE(review): replay_memory_size (20000) is smaller than
    # replay_memory_init_size (50000) -- confirm this is intended.
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=78,  # env.action_num,
                     replay_memory_size=20000,
                     replay_memory_init_size=memory_init_size,
                     norm_step=norm_step,
                     state_shape=env.state_shape,
                     mlp_layers=[512, 1024, 512])

    # NOTE(review): `opponent_agent` is the very same object as `agent` (no
    # frozen copy is taken), so the "last agent" opponents always act with
    # the current weights -- confirm this is the intended form of self-play.
    opponent_agent = agent

    sess.run(tf.compat.v1.global_variables_initializer())

    saver = tf.compat.v1.train.Saver()

    # Training: the learning agent plus (player_num - 1) opponents.
    env.set_agents([agent] + [opponent_agent] * (env.player_num - 1))
    # Evaluation starts against random players; the line-up is reset before
    # each evaluation phase below.
    eval_env.set_agents([agent] + [random_agent] * (env.player_num - 1))

    # Count the number of steps
    step_counter = 0

    # Init a Logger to plot the learning curve against random
    logger_random = Logger(xlabel='episode', ylabel='reward', legend='DQN on TAROT against Random',
                           legend_hist='Histogram of last evaluations against Random', log_path=log_path_random,
                           csv_path=csv_path_random)
    # Init a Logger to plot the learning curve against last opponent
    logger_opponent = Logger(xlabel='episode', ylabel='reward', legend='DQN on TAROT against last agent',
                             legend_hist='Histogram of last evaluations against last agent', log_path=log_path_opponent,
                             csv_path=csv_path_opponent)

    # Games played so far (training and evaluation), and the wall-clock start
    # time used for the progress display.
    total_game_played = 0
    seconds = time.time()

    for episode in range(episode_num):
        # '\r' + end='' keeps the progress display on a single console line.
        print('\rEPISODE {} - Number of game played {} - {}'.format(
            episode, total_game_played,
            time_difference_good_format(seconds, time.time())),
            end='')

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)
        total_game_played += 1

        # Feed transitions into agent memory, and train the agent.
        # Only the first trajectory is used (presumably seat 0, where `agent`
        # was registered first).
        for ts in trajectories[0]:
            agent.feed(ts)
            step_counter += 1

            # Train the agent only once more than
            # memory_init_size + norm_step transitions have been fed.
            train_count = step_counter - (memory_init_size + norm_step)
            if train_count > 0:
                # The returned loss is not used anywhere else in this script.
                loss = agent.train()

        # Evaluate the performance.
        if episode % evaluate_every == 0:
            # Save Model
            model_path = 'rlcard/models/pretrained/self_played_{}/tarot_v{}/model'.format(
                str(record_number),
                str(record_number * 10000 + self_play))

            saver.save(sess, model_path)

            # Eval against random
            reward_random = 0
            reward_random_list = []
            # Kept separate from the opponent evaluation's list so the final
            # random histogram is drawn from random-evaluation data.
            taking_random_list = []
            eval_env.set_agents([agent] + [random_agent] * (env.player_num - 1))
            for eval_episode in range(evaluate_num):
                print('\rEPISODE {} - Eval Random {} over {} - Number of game played {} - {}'.format(
                    episode, eval_episode, evaluate_num, total_game_played,
                    time_difference_good_format(seconds, time.time())),
                    end='')
                _, payoffs = eval_env.run(is_training=False)
                total_game_played += 1
                reward_random_list.append(payoffs[0])
                reward_random += payoffs[0]
                taking_random_list.append(eval_env.game.players[0].taking)

            logger_random.log('\n########## Evaluation Against Random - Episode {} ##########'.format(episode))
            logger_random.log(
                'Timestep: {} Average reward against random is {}'.format(env.timestep,
                                                                          float(reward_random) / evaluate_num))

            # Add point to logger
            logger_random.add_point(x=episode, y=float(reward_random) / evaluate_num)

            # Make plot
            logger_random.make_plot(save_path=figure_path_random + str(episode) + '.png')
            logger_random.make_plot_hist(save_path_1=figure_path_random + str(episode) + '_hist.png',
                                         save_path_2=figure_path_random + str(episode) + '_freq.png',
                                         reward_list=reward_random_list, taking_list=taking_random_list)

            # Eval against last agent
            reward_opponent = 0
            reward_opponent_list = []
            taking_opponent_list = []
            eval_env.set_agents([agent] + [opponent_agent] * (env.player_num - 1))
            for eval_episode in range(evaluate_num):
                print('\rEPISODE {} - Eval Opponent {} over {} - Number of game played {} - {}'.format(
                    episode, eval_episode, evaluate_num, total_game_played,
                    time_difference_good_format(seconds, time.time())),
                    end='')
                _, payoffs = eval_env.run(is_training=False)
                total_game_played += 1
                reward_opponent_list.append(payoffs[0])
                reward_opponent += payoffs[0]
                taking_opponent_list.append(eval_env.game.players[0].taking)

            logger_opponent.log('\n########## Evaluation Against Last Agent - Episode {} ##########'.format(episode))
            logger_opponent.log(
                'Timestep: {} Average reward against last agent is {}'.format(env.timestep,
                                                                              float(reward_opponent) / evaluate_num))

            # Add point to logger
            logger_opponent.add_point(x=episode, y=float(reward_opponent) / evaluate_num)

            # Make plot
            logger_opponent.make_plot(save_path=figure_path_opponent + str(episode) + '.png')
            logger_opponent.make_plot_hist(save_path_1=figure_path_opponent + str(episode) + '_hist.png',
                                           save_path_2=figure_path_opponent + str(episode) + '_freq.png',
                                           reward_list=reward_opponent_list, taking_list=taking_opponent_list)

        if episode % evolve_model_every == 0 and episode > 0:
            # Go to the next self-play index; later checkpoints are written
            # to the folder of the new index.
            self_play += 1

            # NOTE(review): this rebinds the opponent to the same `agent`
            # object it already referenced, so the line-up is unchanged.
            opponent_agent = agent
            env.set_agents([agent] + [opponent_agent] * (env.player_num - 1))
            logger_random.log('\n########## Changing model - Using Model {} as opponent ##########'.format(self_play))
            logger_opponent.log('\n########## Changing model - Using Model {} as opponent ##########'.format(self_play))

    # Make the final plots. The histograms use the data of the most recent
    # evaluation of each kind.
    logger_random.make_plot(save_path=figure_path_random + 'final_' + str(episode) + '.png')
    logger_random.make_plot_hist(save_path_1=figure_path_random + str(episode) + '_hist.png',
                                 save_path_2=figure_path_random + str(episode) + '_freq.png',
                                 reward_list=reward_random_list, taking_list=taking_random_list)
    logger_opponent.make_plot(save_path=figure_path_opponent + 'final_' + str(episode) + '.png')
    logger_opponent.make_plot_hist(save_path_1=figure_path_opponent + str(episode) + '_hist.png',
                                   save_path_2=figure_path_opponent + str(episode) + '_freq.png',
                                   reward_list=reward_opponent_list, taking_list=taking_opponent_list)


Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


EPISODE 0 - Eval Random 999 over 1000 - Number of game played 1000 - 32 seconds
########## Evaluation Against Random - Episode 0 ##########
Timestep: 79 Average reward against random is 1.607
EPISODE 0 - Eval Opponent 999 over 1000 - Number of game played 2000 - 1 minute and 57 seconds
########## Evaluation Against Last Agent - Episode 0 ##########
Timestep: 79 Average reward against last agent is 2.334
EPISODE 5000 - Eval Random 999 over 1000 - Number of game played 8000 - 16 minutes and 49 seconds
########## Evaluation Against Random - Episode 5000 ##########
Timestep: 399486 Average reward against random is 1.197
EPISODE 5000 - Eval Opponent 999 over 1000 - Number of game played 9000 - 18 minutes and 17 seconds
########## Evaluation Against Last Agent - Episode 5000 ##########
Timestep: 399486 Average reward against last agent is -0.465
EPISODE 10000 - Eval Random 999 over 1000 - Number of game played 15000 - 41 minutes and 48 seconds
########## Evaluation Against Random - Episode 1

in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 30000 - Eval Random 999 over 1000 - Number of game played 43000 - 2 hours, 19 minutes and 50 seconds
########## Evaluation Against Random - Episode 30000 ##########
Timestep: 2398793 Average reward against random is 1.898
EPISODE 30000 - Eval Opponent 999 over 1000 - Number of game played 44000 - 2 hours, 21 minutes and 13 seconds
########## Evaluation Against Last Agent - Episode 30000 ##########
Timestep: 2398793 Average reward against last agent is 1.26
EPISODE 35000 - Eval Random 999 over 1000 - Number of game played 50000 - 2 hours, 45 minutes and 5 seconds
########## Evaluation Against Random - Episode 35000 ##########
Timestep: 2799428 Average reward against random is 1.899
EPISODE 35000 - Eval Opponent 999 over 1000 - Number of game played 51000 - 2 hours, 46 minutes and 26 seconds
########## Evaluation Against Last Agent - Episode 35000 ##########
Timestep: 2799428 Average reward against last agent is 2.598


in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 40000 - Eval Random 999 over 1000 - Number of game played 57000 - 3 hours, 10 minutes and 56 seconds
########## Evaluation Against Random - Episode 40000 ##########
Timestep: 3199401 Average reward against random is 1.865
EPISODE 40000 - Eval Opponent 999 over 1000 - Number of game played 58000 - 3 hours, 12 minutes and 20 seconds
########## Evaluation Against Last Agent - Episode 40000 ##########
Timestep: 3199401 Average reward against last agent is 0.738

########## Changing model - Using Model 3 as opponent ##########

########## Changing model - Using Model 3 as opponent ##########
EPISODE 45000 - Eval Random 999 over 1000 - Number of game played 64000 - 3 hours, 36 minutes and 52 seconds
########## Evaluation Against Random - Episode 45000 ##########
Timestep: 3599565 Average reward against random is 1.793
EPISODE 45000 - Eval Opponent 999 over 1000 - Number of game played 65000 - 3 hours, 38 minutes and 17 seconds
########## Evaluation Against Last Agent - Episode 45000 

in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 55000 - Eval Random 999 over 1000 - Number of game played 78000 - 4 hours, 30 minutes and 54 seconds
########## Evaluation Against Random - Episode 55000 ##########
Timestep: 4401569 Average reward against random is 2.117
EPISODE 55000 - Eval Opponent 999 over 1000 - Number of game played 79000 - 4 hours, 32 minutes and 17 seconds
########## Evaluation Against Last Agent - Episode 55000 ##########
Timestep: 4401569 Average reward against last agent is 0.926
EPISODE 60000 - Eval Random 999 over 1000 - Number of game played 85000 - 4 hours, 59 minutes and 6 seconds
########## Evaluation Against Random - Episode 60000 ##########
Timestep: 4802519 Average reward against random is 1.983
EPISODE 60000 - Eval Opponent 999 over 1000 - Number of game played 86000 - 5 hours, 0 minute and 31 seconds
########## Evaluation Against Last Agent - Episode 60000 ##########
Timestep: 4802519 Average reward against last agent is -1.074

########## Changing model - Using Model 4 as opponent ####

in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 70000 - Eval Opponent 999 over 1000 - Number of game played 100000 - 5 hours, 59 minutes and 59 seconds
########## Evaluation Against Last Agent - Episode 70000 ##########
Timestep: 5604429 Average reward against last agent is 2.41


in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 75000 - Eval Random 999 over 1000 - Number of game played 106000 - 6 hours, 28 minutes and 24 seconds
########## Evaluation Against Random - Episode 75000 ##########
Timestep: 6006571 Average reward against random is 2.067
EPISODE 75000 - Eval Opponent 999 over 1000 - Number of game played 107000 - 6 hours, 29 minutes and 44 seconds
########## Evaluation Against Last Agent - Episode 75000 ##########
Timestep: 6006571 Average reward against last agent is 2.182


in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 80000 - Eval Random 999 over 1000 - Number of game played 113000 - 6 hours, 59 minutes and 3 seconds
########## Evaluation Against Random - Episode 80000 ##########
Timestep: 6408260 Average reward against random is 2.032
EPISODE 80000 - Eval Opponent 999 over 1000 - Number of game played 114000 - 7 hours, 0 minute and 30 seconds
########## Evaluation Against Last Agent - Episode 80000 ##########
Timestep: 6408260 Average reward against last agent is -2.161

########## Changing model - Using Model 5 as opponent ##########

########## Changing model - Using Model 5 as opponent ##########
EPISODE 85000 - Eval Random 999 over 1000 - Number of game played 120000 - 7 hours, 31 minutes and 20 seconds
########## Evaluation Against Random - Episode 85000 ##########
Timestep: 6810161 Average reward against random is 1.911
EPISODE 85000 - Eval Opponent 999 over 1000 - Number of game played 121000 - 7 hours, 32 minutes and 46 seconds
########## Evaluation Against Last Agent - Episode 8

in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 90000 - Eval Opponent 999 over 1000 - Number of game played 128000 - 8 hours, 7 minutes and 19 seconds
########## Evaluation Against Last Agent - Episode 90000 ##########
Timestep: 7213004 Average reward against last agent is -3.331
EPISODE 95000 - Eval Random 999 over 1000 - Number of game played 134000 - 8 hours, 41 minutes and 47 seconds
########## Evaluation Against Random - Episode 95000 ##########
Timestep: 7613060 Average reward against random is 2.261


in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 95000 - Eval Opponent 999 over 1000 - Number of game played 135000 - 8 hours, 43 minutes and 8 seconds
########## Evaluation Against Last Agent - Episode 95000 ##########
Timestep: 7613060 Average reward against last agent is 2.153


in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 100000 - Eval Random 999 over 1000 - Number of game played 141000 - 9 hours, 16 minutes and 40 seconds
########## Evaluation Against Random - Episode 100000 ##########
Timestep: 8011342 Average reward against random is 2.359


in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 100000 - Eval Opponent 999 over 1000 - Number of game played 142000 - 9 hours, 18 minutes and 1 second
########## Evaluation Against Last Agent - Episode 100000 ##########
Timestep: 8011342 Average reward against last agent is 1.884


in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)



########## Changing model - Using Model 6 as opponent ##########

########## Changing model - Using Model 6 as opponent ##########
Instructions for updating:
Use standard file APIs to delete files with this prefix.
EPISODE 105000 - Eval Random 999 over 1000 - Number of game played 148000 - 9 hours, 52 minutes and 44 seconds
########## Evaluation Against Random - Episode 105000 ##########
Timestep: 8409742 Average reward against random is 2.367


in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 105000 - Eval Opponent 999 over 1000 - Number of game played 149000 - 9 hours, 54 minutes and 5 seconds
########## Evaluation Against Last Agent - Episode 105000 ##########
Timestep: 8409742 Average reward against last agent is 2.329


in singular transformations; automatically expanding.
bottom=0, top=0
  ret = ax.set_ylim(*args, **kwargs)


EPISODE 110000 - Eval Random 999 over 1000 - Number of game played 155000 - 10 hours, 30 minutes and 16 seconds
########## Evaluation Against Random - Episode 110000 ##########
Timestep: 8808806 Average reward against random is 2.268
EPISODE 110000 - Eval Opponent 999 over 1000 - Number of game played 156000 - 10 hours, 31 minutes and 38 seconds
########## Evaluation Against Last Agent - Episode 110000 ##########
Timestep: 8808806 Average reward against last agent is 2.281
EPISODE 115000 - Eval Random 999 over 1000 - Number of game played 162000 - 11 hours, 10 minutes and 22 seconds
########## Evaluation Against Random - Episode 115000 ##########
Timestep: 9207617 Average reward against random is 2.163
EPISODE 115000 - Eval Opponent 999 over 1000 - Number of game played 163000 - 11 hours, 11 minutes and 43 seconds
########## Evaluation Against Last Agent - Episode 115000 ##########
Timestep: 9207617 Average reward against last agent is 2.138
EPISODE 116105 - Number of game played 16410

KeyboardInterrupt: 