In [None]:
import os
import gym
import numpy as np
import tensorflow as tf

# Killing optional CPU driver warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

class Reinforce(tf.keras.Model):
    """
    Policy network for a REINFORCE agent.

    The forward pass maps a batch of states to one probability distribution over
    actions per state; `loss` turns an episode of (state, action, reward) triples
    into the policy-gradient objective.
    """

    def __init__(self, state_size, num_actions):
        """
        Build the layers and the optimizer.

        :param state_size: size of one state vector (kept for interface
            compatibility; it is not read by this constructor)
        :param num_actions: number of discrete actions, i.e. the width of the output layer
        """
        super(Reinforce, self).__init__()
        self.num_actions = num_actions

        self.optimizer = tf.optimizers.Adam()
        # NOTE(review): no `activation` is passed to any Dense layer, so the Keras
        # default (linear) applies and the three layers compose to one affine map
        # before the softmax. Left unchanged here; consider a non-linearity on D1/D2.
        self.D1 = tf.keras.layers.Dense(32)
        self.D2 = tf.keras.layers.Dense(32)
        self.D3 = tf.keras.layers.Dense(num_actions)

    def call(self, states):
        """
        Performs the forward pass on a batch of states to generate the action probabilities.

        :param states: An [episode_length, state_size] dimensioned array
        representing the history of states of an episode
        :return: A [episode_length, num_actions] matrix in which each row is the
        probability distribution over actions for the corresponding state
        """
        hidden = self.D1(states)
        hidden = self.D2(hidden)
        logits = self.D3(hidden)

        return tf.nn.softmax(logits)

    def loss(self, states, actions, discounted_rewards):
        """
        Computes the REINFORCE loss: -sum_t log(pi(a_t | s_t)) * R_t.

        :param states: A batch of states of shape [episode_length, state_size]
        :param actions: History of actions taken at each timestep of the episode.
        Either an [episode_length] array or an [episode_length, 1] array of integer
        action indices; both are flattened to one index per timestep.
        :param discounted_rewards: Discounted rewards throughout a complete episode
        (one value per timestep; flattened to [episode_length])
        :return: loss, a Tensorflow scalar
        """
        probs = self.call(np.array(states))

        # Force exactly one index per timestep. Without this, [episode_length, 1]
        # actions became [episode_length, 1, 1] indices, gather_nd returned
        # [episode_length, 1], and the product with the [episode_length] rewards
        # broadcast to an [episode_length, episode_length] matrix.
        action_indices = tf.reshape(actions, [-1, 1])
        rewards = tf.reshape(tf.cast(discounted_rewards, probs.dtype), [-1])

        # batch_dims=1 picks probs[t, action_indices[t]] for every timestep t.
        chosen_probs = tf.gather_nd(probs, action_indices, batch_dims=1)

        # Clip away from zero so an underflowed probability cannot yield log(0) = -inf.
        log_probs = tf.math.log(tf.clip_by_value(chosen_probs, 1e-10, 1.0))

        return -tf.reduce_sum(log_probs * rewards)

class Reinforce2(tf.keras.Model):
    """
    Policy network for a REINFORCE agent (variant with an 8-unit second layer).

    The forward pass maps a batch of states to one probability distribution over
    actions per state; `loss` turns an episode of (state, action, reward) triples
    into the policy-gradient objective.
    """

    def __init__(self, state_size, num_actions):
        """
        Build the layers and the optimizer.

        :param state_size: size of one state vector (kept for interface
            compatibility; it is not read by this constructor)
        :param num_actions: number of actions in an environment
        """
        super(Reinforce2, self).__init__()
        self.num_actions = num_actions

        self.optimizer = tf.optimizers.Adam()
        # NOTE(review): no `activation` is passed to any Dense layer, so the Keras
        # default (linear) applies and the three layers compose to one affine map
        # before the softmax. Left unchanged here; consider a non-linearity on D1/D2.
        self.D1 = tf.keras.layers.Dense(32)
        self.D2 = tf.keras.layers.Dense(8)
        self.D3 = tf.keras.layers.Dense(num_actions)

    def call(self, states):
        """
        Performs the forward pass on a batch of states to generate the action probabilities.

        :param states: An [episode_length, state_size] dimensioned array
        representing the history of states of an episode
        :return: A [episode_length, num_actions] matrix in which each row is the
        probability distribution over actions for the corresponding state
        """
        hidden = self.D1(states)
        hidden = self.D2(hidden)
        logits = self.D3(hidden)

        return tf.nn.softmax(logits)

    def loss(self, states, actions, discounted_rewards):
        """
        Computes the REINFORCE loss: -sum_t log(pi(a_t | s_t)) * R_t.

        :param states: A batch of states of shape [episode_length, state_size]
        :param actions: History of actions taken at each timestep of the episode.
        Either an [episode_length] array or an [episode_length, 1] array of integer
        action indices; both are flattened to one index per timestep.
        :param discounted_rewards: Discounted rewards throughout a complete episode
        (one value per timestep; flattened to [episode_length])
        :return: loss, a Tensorflow scalar
        """
        probs = self.call(np.array(states))

        # Force exactly one index per timestep. Without this, [episode_length, 1]
        # actions became [episode_length, 1, 1] indices, gather_nd returned
        # [episode_length, 1], and the product with the [episode_length] rewards
        # broadcast to an [episode_length, episode_length] matrix.
        action_indices = tf.reshape(actions, [-1, 1])
        rewards = tf.reshape(tf.cast(discounted_rewards, probs.dtype), [-1])

        # batch_dims=1 picks probs[t, action_indices[t]] for every timestep t.
        chosen_probs = tf.gather_nd(probs, action_indices, batch_dims=1)

        # Clip away from zero so an underflowed probability cannot yield log(0) = -inf.
        log_probs = tf.math.log(tf.clip_by_value(chosen_probs, 1e-10, 1.0))

        return -tf.reduce_sum(log_probs * rewards)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Started the round 1
Street "preflop" started. (community card = [])
SMALLSTACK/POT
3
198
"p2" declared "call:2"
SMALLSTACK/POT
4
198
"p1" declared "call:2"
Street "flop" started. (community card = ['SQ', 'SA', 'CK'])
SMALLSTACK/POT
4
198
"p2" declared "call:0"
SMALLSTACK/POT
4
198
"p1" declared "call:0"
Street "turn" started. (community card = ['SQ', 'SA', 'CK', 'H3'])
SMALLSTACK/POT
4
198
"p2" declared "call:0"
SMALLSTACK/POT
4
198
"p1" declared "call:0"
Street "river" started. (community card = ['SQ', 'SA', 'CK', 'H3', 'S3'])
SMALLSTACK/POT
4
198
"p2" declared "call:0"
SMALLSTACK/POT
4
198
"p1" declared "call:0"
"['p2']" won the round 1 (stack = {'p1': 198, 'p2': 202})
Started the round 2
Street "preflop" started. (community card = [])
SMALLSTACK/POT
3
197
"p1" declared "call:2"
SMALLSTACK/POT
4
196
"p2" declared "call:2"
Street "flop" started. (community card = ['CQ', 'C2', 'H4'])
SMAL

In [None]:
#TODO figure out action output space and state input space


#https://github.com/ishikota/PyPokerEngine
#https://ishikota.github.io/PyPokerEngine/

!pip install pypokerengine

from pypokerengine.players import BasePokerPlayer

from pypokerengine.utils.game_state_utils import restore_game_state

# Length of the feature vector built in declare_action:
# 2 hole cards * 52 + 5 community cards * 52 + 1 position flag + 4 street flags = 369.
STATE_SIZE = 369
STATE_SIZE2 = 369
# Policy outputs: 0 = fold, 1 = call, 2-5 = raise by min-raise plus 25/50/75/100% of the pot.
ACTIONS = 6

# NOTE(review): ReinforceWithBaseline is not defined in the cells visible here
# (only Reinforce and Reinforce2 are); confirm it is defined before this cell runs
# on a fresh kernel.
# model1 is used by FishPlayer, model2 by FishPlayer2.
model1 = ReinforceWithBaseline(STATE_SIZE2, ACTIONS)
model2 = ReinforceWithBaseline(STATE_SIZE, ACTIONS)

# Running counts of the engine actions actually declared:
# totalactions is incremented by FishPlayer, totalactions1 by FishPlayer2.
totalactions = {'call': 0, 'fold': 0, 'raise': 0}
totalactions1 = {'call': 0, 'fold': 0, 'raise': 0}

# Sum of FishPlayer's scaled per-round rewards (updated in its receive_round_result_message).
totalnetgain = 0

# Multiplier applied to the per-round stack change to form the reward.
# NOTE(review): despite the name this is not passed to any optimizer in the visible code.
learningrate = 0.005

# Only referenced by a commented-out softmax re-weighting line in declare_action.
beta = 1.0

class FishPlayer(BasePokerPlayer):  # Do not forget to make parent class as "BasePokerPlayer"
    """
    Poker player whose actions are sampled from the module-level policy `model1`.

    Every decision appends one (state, action, reward) entry to per-instance
    buffers; the reward of the last decision of a round is filled in when the round
    ends, and `model1` is updated from the buffers once enough rounds have been
    collected. The blind bookkeeping assumes a two-player table with blinds of 1 and 2.
    """

    # Rank characters in ascending order; a card's column in the one-hot grid is its index here.
    CARD_RANKS = ['2','3','4','5','6','7','8','9','T','J','Q','K','A']
    # Suit character (first character of a card string) -> row in the one-hot grid.
    SUIT_ROWS = {'H': 0, 'D': 1, 'C': 2, 'S': 3}
    # Street names in the order of their one-hot slots.
    STREETS = ['preflop', 'flop', 'turn', 'river']
    # Policy output -> multiple of the main pot added on top of the minimum raise.
    # The last entry is the int 1 so an integer pot keeps producing an integer amount.
    RAISE_POT_FRACTIONS = {2: 0.25, 3: 0.5, 4: 0.75, 5: 1}
    # The position feature is 1 when the big blind sits at this seat index.
    BIG_BLIND_SEAT = 0
    # A gradient step is taken on the round at which `count` exceeds this value.
    ROUNDS_PER_UPDATE = 96

    def __init__(self, n):
        """
        :param n: player name; compared against each seat's 'name' to find this
            player's stack in the engine messages
        """
        super().__init__()
        self.name = n
        self.startstack = 0
        self.endstack = 0
        # Rounds counted since the last gradient step.
        self.count = 0
        # Per-instance buffers (previously class-level lists shared by all instances).
        self.states = []
        self.actions = []
        self.rewards = []

    def _one_hot_cards(self, cards, slots):
        """
        One-hot encode up to `slots` cards as a flat vector of length slots * 52.

        Each slot is a 4 x 13 (suit x rank) grid; slots without a card stay all-zero.
        A card with an unknown suit character is left all-zero; an unknown rank
        character raises ValueError.
        """
        grid = np.zeros((slots, 4, 13))
        for slot, card in enumerate(cards):
            rank_col = self.CARD_RANKS.index(card[1])
            suit_row = self.SUIT_ROWS.get(card[0])
            if suit_row is not None:
                grid[slot][suit_row][rank_col] = 1
        return grid.flatten()

    def _build_state(self, hole_card, round_state):
        """
        Build the feature vector:
        [hole cards (104), community cards (260), position flag (1), street one-hot (4)].
        """
        hole = self._one_hot_cards(hole_card, 2)
        board = self._one_hot_cards(round_state['community_card'], 5)

        pos = np.zeros(1)
        if round_state['big_blind_pos'] == self.BIG_BLIND_SEAT:
            pos[0] = 1

        street = np.zeros(4)
        if round_state['street'] in self.STREETS:
            street[self.STREETS.index(round_state['street'])] = 1

        return np.concatenate([hole, board, pos, street])

    def _to_engine_action(self, output, valid_actions, round_state):
        """
        Translate a sampled policy output (0-5) into an (action, amount) pair.

        `valid_actions` is indexed as [fold_info, call_info, raise_info].
        """
        call_amount = valid_actions[1]['amount']

        if output == 0:
            # Never fold when continuing is free.
            if call_amount == 0:
                return 'call', 0
            return 'fold', 0

        if output == 1:
            return 'call', call_amount

        raise_limits = valid_actions[2]['amount']
        pot = round_state['pot']['main']['amount']
        # NOTE(review): fractional pot multiples can produce non-integer chip amounts.
        amount = raise_limits['min'] + self.RAISE_POT_FRACTIONS[output] * pot

        # A max of -1 is treated as "raising is not available": fall back to a call.
        if raise_limits['max'] == -1:
            return 'call', call_amount

        if amount > raise_limits['max']:
            amount = raise_limits['max']
        return 'raise', amount

    #  we define the logic to make an action through this method. (so this method would be the core of your AI)
    def declare_action(self, valid_actions, hole_card, round_state):
        """
        Sample an action from `model1` for the current state and record it.

        :param valid_actions: [fold_info, call_info, raise_info] as sent by the engine
        :param hole_card: this player's two cards as suit+rank strings, e.g. 'DK'
        :param round_state: engine description of the current round
        :return: (action, amount) sent to the poker engine
        """
        state = self._build_state(hole_card, round_state)

        #print(state)
        print(len(state))

        probs = model1.call(tf.expand_dims(state, 0))

        p2 = np.array(probs)[0]

        print(p2)

        # Renormalise so the probabilities handed to np.random.choice sum to 1.
        p2 /= p2.sum()

        print("p1 probs: ")
        print(p2)

        # Stored as a plain int so the action history is a flat [episode_length]
        # sequence, matching the shape documented for loss() in the visible
        # Reinforce classes (a shape-(1,) array per step made it [episode_length, 1]).
        output = int(np.random.choice([0,1,2,3,4,5], 1, p=p2)[0])

        print(self.name + " hole cards: ")
        print(hole_card)
        print(valid_actions)

        action, amount = self._to_engine_action(output, valid_actions, round_state)

        totalactions[action] += 1

        self.states.append(state)
        self.actions.append(output)
        # Placeholder; the last entry of a round is overwritten when the round ends.
        self.rewards.append(0)

        return action, amount   # action returned here is sent to the poker engine

    def receive_game_start_message(self, game_info):
        pass

    def receive_round_start_message(self, round_count, hole_card, seats):
        """Start a fresh batch right after an update and remember the starting stack."""
        if self.count == 0:
            self.actions = []
            self.states = []
            self.rewards = []

        for seat in seats:
            if seat['name'] == self.name:
                self.startstack = seat['stack']

    def receive_street_start_message(self, street, round_state):
        pass

    def receive_game_update_message(self, action, round_state):
        pass

    def receive_round_result_message(self, winners, hand_info, round_state):
        """
        Assign the round's reward to the last recorded decision and, every
        ROUNDS_PER_UPDATE + 1 counted rounds, take one gradient step on `model1`.
        """
        global totalnetgain

        # Nothing recorded yet.
        if not self.rewards:
            return

        # The last entry already carries a reward, i.e. no new decision was
        # recorded since it was assigned.
        if not self.rewards[-1] == 0:
            return

        for seat_index, seat in enumerate(round_state['seats']):
            if seat['name'] == self.name:
                # Add the posted blind (1 for the small blind, otherwise 2) back onto
                # the stack recorded at round start before computing the change.
                if round_state['small_blind_pos'] == seat_index:
                    self.startstack += 1
                else:
                    self.startstack += 2

                self.endstack = seat['stack']
                self.rewards[-1] = (self.endstack - self.startstack) * learningrate

        print("p1 rewards: ")
        print(self.rewards)

        print("p1 actions map: ")
        print(totalactions)

        totalnetgain += self.rewards[-1]

        print("TOTALNETGAIN p1: ")
        print(totalnetgain)

        if self.count > self.ROUNDS_PER_UPDATE:
            with tf.GradientTape() as tape:
                myloss = model1.loss(self.states, self.actions, self.rewards)

            gradients = tape.gradient(myloss, model1.trainable_variables)
            model1.optimizer.apply_gradients(zip(gradients, model1.trainable_variables))

            self.count = 0
        else:
            self.count += 1


class FishPlayer2(BasePokerPlayer):  # Do not forget to make parent class as "BasePokerPlayer"
    """
    Poker player whose actions are sampled from the module-level policy `model2`.

    Every decision appends one (state, action, reward) entry to per-instance
    buffers; the reward of the last decision of a round is filled in when the round
    ends, and `model2` is updated from the buffers once enough rounds have been
    collected. The blind bookkeeping assumes a two-player table with blinds of 1 and 2.
    """

    # Rank characters in ascending order; a card's column in the one-hot grid is its index here.
    CARD_RANKS = ['2','3','4','5','6','7','8','9','T','J','Q','K','A']
    # Suit character (first character of a card string) -> row in the one-hot grid.
    SUIT_ROWS = {'H': 0, 'D': 1, 'C': 2, 'S': 3}
    # Street names in the order of their one-hot slots.
    STREETS = ['preflop', 'flop', 'turn', 'river']
    # Policy output -> multiple of the main pot added on top of the minimum raise.
    # The last entry is the int 1 so an integer pot keeps producing an integer amount.
    RAISE_POT_FRACTIONS = {2: 0.25, 3: 0.5, 4: 0.75, 5: 1}
    # The position feature is 1 when the big blind sits at this seat index.
    BIG_BLIND_SEAT = 1
    # A gradient step is taken on the round at which `count` exceeds this value.
    ROUNDS_PER_UPDATE = 96

    def __init__(self, n):
        """
        :param n: player name; compared against each seat's 'name' to find this
            player's stack in the engine messages
        """
        super().__init__()
        self.name = n
        self.startstack = 0
        self.endstack = 0
        # Rounds counted since the last gradient step.
        self.count = 0
        # Per-instance buffers (previously class-level lists shared by all instances).
        self.states = []
        self.actions = []
        self.rewards = []

    def _one_hot_cards(self, cards, slots):
        """
        One-hot encode up to `slots` cards as a flat vector of length slots * 52.

        Each slot is a 4 x 13 (suit x rank) grid; slots without a card stay all-zero.
        A card with an unknown suit character is left all-zero; an unknown rank
        character raises ValueError.
        """
        grid = np.zeros((slots, 4, 13))
        for slot, card in enumerate(cards):
            rank_col = self.CARD_RANKS.index(card[1])
            suit_row = self.SUIT_ROWS.get(card[0])
            if suit_row is not None:
                grid[slot][suit_row][rank_col] = 1
        return grid.flatten()

    def _build_state(self, hole_card, round_state):
        """
        Build the feature vector:
        [hole cards (104), community cards (260), position flag (1), street one-hot (4)].
        """
        hole = self._one_hot_cards(hole_card, 2)
        board = self._one_hot_cards(round_state['community_card'], 5)

        pos = np.zeros(1)
        if round_state['big_blind_pos'] == self.BIG_BLIND_SEAT:
            pos[0] = 1

        street = np.zeros(4)
        if round_state['street'] in self.STREETS:
            street[self.STREETS.index(round_state['street'])] = 1

        return np.concatenate([hole, board, pos, street])

    def _to_engine_action(self, output, valid_actions, round_state):
        """
        Translate a sampled policy output (0-5) into an (action, amount) pair.

        `valid_actions` is indexed as [fold_info, call_info, raise_info].
        """
        call_amount = valid_actions[1]['amount']

        if output == 0:
            # Never fold when continuing is free.
            if call_amount == 0:
                return 'call', 0
            return 'fold', 0

        if output == 1:
            return 'call', call_amount

        raise_limits = valid_actions[2]['amount']
        pot = round_state['pot']['main']['amount']
        # NOTE(review): fractional pot multiples can produce non-integer chip amounts.
        amount = raise_limits['min'] + self.RAISE_POT_FRACTIONS[output] * pot

        # A max of -1 is treated as "raising is not available": fall back to a call.
        if raise_limits['max'] == -1:
            return 'call', call_amount

        if amount > raise_limits['max']:
            amount = raise_limits['max']
        return 'raise', amount

    #  we define the logic to make an action through this method. (so this method would be the core of your AI)
    def declare_action(self, valid_actions, hole_card, round_state):
        """
        Sample an action from `model2` for the current state and record it.

        :param valid_actions: [fold_info, call_info, raise_info] as sent by the engine
        :param hole_card: this player's two cards as suit+rank strings, e.g. 'DK'
        :param round_state: engine description of the current round
        :return: (action, amount) sent to the poker engine
        """
        state = self._build_state(hole_card, round_state)

        probs = model2.call(tf.expand_dims(state, 0))[0]

        p2 = np.array(probs)

        print(p2)

        # Renormalise so the probabilities handed to np.random.choice sum to 1.
        p2 /= p2.sum()

        print("p2 probs: ")
        print(p2)

        # Stored as a plain int so the action history is a flat [episode_length]
        # sequence, matching the shape documented for loss() in the visible
        # Reinforce classes (a shape-(1,) array per step made it [episode_length, 1]).
        output = int(np.random.choice([0,1,2,3,4,5], 1, p=p2)[0])

        print(self.name + " hole cards: ")
        print(hole_card)
        print(valid_actions)

        action, amount = self._to_engine_action(output, valid_actions, round_state)

        totalactions1[action] += 1

        self.states.append(state)
        self.actions.append(output)
        # Placeholder; the last entry of a round is overwritten when the round ends.
        self.rewards.append(0)

        return action, amount   # action returned here is sent to the poker engine

    def receive_game_start_message(self, game_info):
        pass

    def receive_round_start_message(self, round_count, hole_card, seats):
        """Start a fresh batch right after an update and remember the starting stack."""
        if self.count == 0:
            self.actions = []
            self.states = []
            self.rewards = []

        for seat in seats:
            if seat['name'] == self.name:
                self.startstack = seat['stack']

    def receive_street_start_message(self, street, round_state):
        pass

    def receive_game_update_message(self, action, round_state):
        pass

    def receive_round_result_message(self, winners, hand_info, round_state):
        """
        Assign the round's reward to the last recorded decision and, every
        ROUNDS_PER_UPDATE + 1 counted rounds, take one gradient step on `model2`.
        """
        # Nothing recorded yet.
        if not self.rewards:
            return

        # The last entry already carries a reward, i.e. no new decision was
        # recorded since it was assigned.
        if not self.rewards[-1] == 0:
            return

        for seat_index, seat in enumerate(round_state['seats']):
            if seat['name'] == self.name:
                # Add the posted blind (1 for the small blind, otherwise 2) back onto
                # the stack recorded at round start before computing the change.
                if round_state['small_blind_pos'] == seat_index:
                    self.startstack += 1
                else:
                    self.startstack += 2

                self.endstack = seat['stack']
                self.rewards[-1] = (self.endstack - self.startstack) * learningrate

        print("p2 reward: ")
        print(self.rewards)

        print("p2 actions map: ")
        print(totalactions1)

        if self.count > self.ROUNDS_PER_UPDATE:
            with tf.GradientTape() as tape:
                myloss = model2.loss(self.states, self.actions, self.rewards)

            gradients = tape.gradient(myloss, model2.trainable_variables)
            model2.optimizer.apply_gradients(zip(gradients, model2.trainable_variables))

            self.count = 0
        else:
            self.count += 1

from pypokerengine.api.game import setup_config, start_poker

# Two-player table: 50 rounds per game, 200 starting chips, small blind of 1.
config = setup_config(
    max_round=50,
    initial_stack=200,
    small_blind_amount=1,
)

# Registration order is kept as p1 then p2.
for player_name, player_cls in (("p1", FishPlayer), ("p2", FishPlayer2)):
    config.register_player(name=player_name, algorithm=player_cls(player_name))

# Play 25000 games back to back; the same player objects are reused for every game.
for i in range(25000):
    game_result = start_poker(config, verbose=1)

    # The label says ROUND, but `i` counts whole games.
    print(f"ROUND : {i}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
TOTALNETGAIN p1: 
3963.2107861393
p2 reward: 
[-0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01, -0.005, -0.005, -0.01, -0.005, -0.005, -0.01, -0.005, -0.01, -0.005, -0.01]
p2 actions map: 
{'call': 40563, 'fold': 663156, 'raise': 131173}
Started the round 21
Street "preflop" started. (community card = [])
[9.9602950e-01 2.3854910e-04 5.7090941e-04 1.1586446e-03 1.2210511e-04
 1.8803807e-03]
p2 probs: 
[9.9602938e-01 2.3854907e-04 5.7090935e-04 1.1586444e-03 1.2210509e-04
 1.8803804e-03]
p2 hole cards: 
['DK', 'SJ']
[{'action': 'fold', 'amount': 0}, {'action': 'call', 'amount': 2}, {'action': 'raise', 'amount': {'min': 3, 'max': 176}}]
"p2" declared "fold:0"
"['p1']" won the round 21 (stack = {'p1': 222.5, 'p2': 175})
p2 reward: 
[-0.005, -0.01, -0.005, -0.01, -0.0