In [None]:
print(game.state.players[0], game.state.players[1], game.state.dealer)

In [None]:
%load_ext autoreload
%autoreload 2
from rlcard.games.thegame import TheGameGame, TheGamePlayer

players = [TheGamePlayer('Nico'), TheGamePlayer('Costy')]
game = TheGameGame(
    game_size=15,
    hand_size=6,
    min_cards=2,
    players=players,
    verbose=1
)

game.run()

## Play with agent

In [None]:
import rlcard

# Make environment and enable human mode
env = rlcard.make('leduc-holdem')

# Set it to human mode
env.set_mode(human_mode=True)

print(">> Leduc Hold'em pre-trained model")

# Reset environment
state = env.reset()

while True:
    action = input('>> You choose action (integer): ')
    while not action.isdigit() or int(action) not in state['legal_actions']:
        print('Action illegel...')
        action = input('>> Re-choose action (integer): ')
         
    state, _, _ = env.step(int(action))


In [1]:
import rlcard
from rlcard.agents.random_agent import RandomAgent

# Make environment
env = rlcard.make('limit-holdem')

agent = RandomAgent(action_num=env.action_num)

env.set_agents([agent, agent, agent])

trajectories, payoffs = env.run()


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [8]:
import rlcard
from rlcard.agents.random_agent import RandomAgent
from rlcard.utils.utils import set_global_seed

# Make environment
env = rlcard.make('blackjack')
episode_num = 2

# Set a global seed
set_global_seed(0)

# Set up agents
agent_0 = RandomAgent(action_num=env.action_num)
env.set_agents([agent_0])

for episode in range(episode_num):

    # Generate data from the environment
    trajectories, _ = env.run(is_training=False)

    # Print out the trajectories
    print('\nEpisode {}'.format(episode))
    for ts in trajectories[0]:
        print('State: {}, Action: {}, Reward: {}, Next State: {}, Done: {}'.format(ts[0], ts[1], ts[2], ts[3], ts[4]))


Episode 0
State: {'obs': array([21,  3]), 'legal_actions': [0, 1]}, Action: 0, Reward: 0, Next State: {'obs': array([15,  3]), 'legal_actions': [0, 1]}, Done: False
State: {'obs': array([15,  3]), 'legal_actions': [0, 1]}, Action: 1, Reward: -1, Next State: {'obs': array([15, 20]), 'legal_actions': [0, 1]}, Done: True

Episode 1
State: {'obs': array([15,  5]), 'legal_actions': [0, 1]}, Action: 1, Reward: 1, Next State: {'obs': array([15, 23]), 'legal_actions': [0, 1]}, Done: True


In [7]:
import tensorflow as tf
import os
import numpy as np

import rlcard
from rlcard.agents.dqn_agent import DQNAgent
from rlcard.agents.random_agent import RandomAgent
from rlcard.utils.utils import set_global_seed
from rlcard.utils.logger import Logger

# Make environment
env = rlcard.make('leduc-holdem')
eval_env = rlcard.make('leduc-holdem')

# Set the iterations numbers and how frequently we evaluate/save plot
evaluate_every = 1000
evaluate_num = 10000
timesteps = 100000

# The intial memory size
memory_init_size = 1000

# Train the agent every X steps
train_every = 1

# The paths for saving the logs and learning curves
log_dir = './experiments/leduc_holdem_single_dqn_result/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[128,128])
    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    state = env.reset()

    for timestep in range(timesteps):
        action = agent.step(state)
        next_state, reward, done = env.step(action)
        ts = (state, action, reward, next_state, done)
        agent.feed(ts)

        if timestep % evaluate_every == 0:
            rewards = []
            state = eval_env.reset()
            for _ in range(evaluate_num):
                action, _ = agent.eval_step(state)
                _, reward, done = env.step(action)
                if done:
                    rewards.append(reward)
            logger.log_performance(env.timestep, np.mean(rewards))

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('DQN')
    
    # Save model
    save_dir = 'models/leduc_holdem_single_dqn'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, 'model'))




TypeError: __init__() got an unexpected keyword argument 'train_every'

In [1]:
import rlcard


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [1]:
import torch
n = 10000
a = torch.rand(n, n)
b = torch.rand(n, n)
c = a.cuda()
d = b.cuda()

In [2]:
%time a@b

Wall time: 15.8 s


tensor([[2497.8679, 2508.6377, 2523.4033,  ..., 2469.9338, 2528.7549,
         2518.1609],
        [2459.0049, 2499.8340, 2527.2683,  ..., 2461.7351, 2518.4272,
         2501.4077],
        [2506.0278, 2526.7751, 2532.4163,  ..., 2486.4153, 2547.0908,
         2524.3271],
        ...,
        [2501.7178, 2528.1509, 2532.3638,  ..., 2468.3743, 2540.3877,
         2518.7964],
        [2477.8696, 2494.7334, 2508.3547,  ..., 2473.5212, 2541.2661,
         2511.5283],
        [2478.0259, 2506.8237, 2530.0471,  ..., 2483.1580, 2528.5720,
         2521.1538]])

In [3]:
%time c@d

Wall time: 1.13 s


tensor([[2497.8667, 2508.6411, 2523.4062,  ..., 2469.9275, 2528.7561,
         2518.1594],
        [2459.0076, 2499.8340, 2527.2708,  ..., 2461.7388, 2518.4268,
         2501.4011],
        [2506.0288, 2526.7732, 2532.4124,  ..., 2486.4226, 2547.0859,
         2524.3232],
        ...,
        [2501.7112, 2528.1499, 2532.3699,  ..., 2468.3735, 2540.3860,
         2518.7883],
        [2477.8711, 2494.7378, 2508.3528,  ..., 2473.5266, 2541.2656,
         2511.5308],
        [2478.0205, 2506.8279, 2530.0410,  ..., 2483.1538, 2528.5693,
         2521.1553]], device='cuda:0')

# Policy Evaluation (random policy)

In [66]:
import numpy as np
n = 4
rewards = -1 * np.ones((4, 4))
rewards[0, 0] = rewards[-1, -1] = 0

values = np.zeros((4, 4))

def expected_next_value(values, curr_state_x, curr_state_y):
    exp_value = 0
    if (curr_state_x==0 and curr_state_y==0) or (curr_state_x==n-1 and curr_state_y==n-1):
        return 0
    for i in [-1, 1]:
        new_state_x = np.clip(curr_state_x + i, 0, n-1)
        new_state_y = np.clip(curr_state_y + i, 0, n-1)
        # we end in new_state with 1/4 prob
        exp_value += 1/4*values[curr_state_x, new_state_y]
        exp_value += 1/4*values[new_state_x, curr_state_y]
    return exp_value

for x in range(500):
    new_values = np.zeros((4, 4))
    for i in range(n):
        for j in range(n):
            new_values[i, j] = rewards[i, j] + expected_next_value(values, i, j)
    values = new_values
print(values)

[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]
