<a href="https://colab.research.google.com/github/JonathanDou/PokerAI/blob/main/Poker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Killing optional CPU driver warnings.
# This must happen BEFORE TensorFlow is imported: the variable is consumed by
# TensorFlow's native logging, so setting it afterwards does not silence the
# messages emitted while the library loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import gym
import numpy as np
import tensorflow as tf

class Reinforce(tf.keras.Model):
    """Policy network trained with the REINFORCE policy-gradient loss.

    Three 3x3 convolutions (256, 128 and 64 filters) are followed by a flatten
    and a dense layer that emits one logit per action; ``call`` turns those
    logits into a probability distribution with a softmax.
    """

    # Lower bound applied to probabilities before taking their log, so an
    # action whose probability underflows to 0 cannot produce -inf / NaN.
    _MIN_PROB = 1e-8

    def __init__(self, state_size, state_size2, num_actions):
        """
        Build the layers and the optimizer.

        :param state_size: first spatial dimension of one state (passed to the
        first convolution as part of its ``input_shape`` hint)
        :param state_size2: second spatial dimension of one state
        :param num_actions: number of discrete actions, i.e. the width of the policy output
        """
        super().__init__()
        self.num_actions = num_actions

        lr = 0.00005

        self.optimizer = tf.optimizers.Adam(learning_rate=lr)

        self.D1 = tf.keras.layers.Conv2D(filters=256, kernel_size=3, strides = (1,1), padding='same', activation = 'relu', input_shape=[state_size,state_size2, 2])

        self.D2 = tf.keras.layers.Conv2D(filters=128, kernel_size=3, strides = (1,1), padding='same', activation = 'relu')

        self.D3 = tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides = (1,1), padding='same', activation = 'relu')

        # Created once here instead of on every forward pass.
        self.flatten = tf.keras.layers.Flatten()

        self.D4 = tf.keras.layers.Dense(num_actions, kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03))

    def call(self, states):
        """
        Performs the forward pass on a batch of states to generate the action probabilities.

        :param states: a 4-D batch of states, one entry per timestep. The
        ``input_shape`` hint given to the first convolution is
        [state_size, state_size2, 2]; the layers build from the first batch they see.
        :return: A [episode_length, num_actions] matrix in which each row is the
        probability distribution over actions for the matching state
        """
        features = self.D1(states)
        features = self.D2(features)
        features = self.D3(features)

        logits = self.D4(self.flatten(features))

        return tf.nn.softmax(logits)

    def loss(self, states, actions, discounted_rewards):
        """
        Computes the REINFORCE loss: the negative sum over timesteps of
        log pi(a_t | s_t) weighted by that timestep's reward.

        :param states: the states of one episode (a sequence that stacks into a 4-D batch)
        :param actions: History of actions taken at each timestep of the episode (represented as an [episode_length] array)
        :param discounted_rewards: Discounted rewards throughout a complete episode (represented as an [episode_length] array)
        :return: loss, a Tensorflow scalar
        """
        probs = self.call(np.array(states))

        # One index per row, so batch_dims=1 picks probs[t, actions[t]].
        action_indices = tf.expand_dims(actions, 1)
        chosen_probs = tf.gather_nd(probs, action_indices, batch_dims=1)

        # Clip before the log: log(0) = -inf, and -inf * 0 reward = NaN, which
        # would corrupt every weight on the next optimizer step.
        log_probs = tf.math.log(tf.clip_by_value(chosen_probs, self._MIN_PROB, 1.0))

        # Rewards may arrive as a Python list or a float64 array; bring them to
        # the dtype of the log-probabilities before multiplying.
        weights = tf.cast(tf.convert_to_tensor(discounted_rewards), log_probs.dtype)

        return -tf.reduce_sum(log_probs * weights)


# New Section

In [None]:
#TODO figure out action output space and state input space


#https://github.com/ishikota/PyPokerEngine
#https://ishikota.github.io/PyPokerEngine/

!pip install pypokerengine

from pypokerengine.players import BasePokerPlayer
from pypokerengine.engine.hand_evaluator import HandEvaluator
from pypokerengine.engine.card import Card
from pypokerengine.utils.game_state_utils import restore_game_state

# Dimensions handed to the Reinforce constructor: STATE_SIZE and STATE_SIZE2
# become its state_size / state_size2 arguments, ACTIONS its num_actions.
STATE_SIZE = 4
STATE_SIZE2 = 13
ACTIONS = 3

# Single shared policy network; FishPlayer reads and trains this global.
model1 = Reinforce(STATE_SIZE, STATE_SIZE2, ACTIONS)

# Running count of the actions FishPlayer.declare_action has returned.
totalactions = {'fold': 0, 'call': 0, 'raise': 0}
# Only referenced by the disabled FishPlayer2 code further down.
totalactions1 = {'fold': 0, 'call': 0, 'raise': 0}

# Sum of FishPlayer's per-round net gain, updated in receive_round_result_message.
totalnetgain = 0

# Only referenced from commented-out / disabled code in this file.
learningrate = 1

# Only referenced from a commented-out softmax-temperature line.
beta = 1.0

# Hole cards of the current round, published by the players' round-start
# callbacks: mycards by FishPlayer, oppcards by BadPlayer.
mycards = []
oppcards = []


def discount(rewards, discount_factor=.95):
    """
    Takes in a list of rewards for each timestep in an episode, and
    returns the discounted rewards for each timestep, which are calculated
    by summing the rewards for each future timestep, discounted by how far
    in the future it is.
    For example, in the simple case where the episode rewards are [1, 3, 5]
    and discount_factor = .99 we would calculate:
    dr_1 = 1 + 0.99 * 3 + 0.99^2 * 5 = 8.8705
    dr_2 = 3 + 0.99 * 5 = 7.95
    dr_3 = 5
    and thus return [8.8705, 7.95 , 5].

    The result is always a fresh float array (the input is never returned or
    modified), and an empty episode yields an empty array.

    :param rewards: List of rewards from an episode [r_{t1},r_{t2},...]
    :param discount_factor: Gamma discounting factor to use, defaults to .95
    :returns: discounted_rewards: 1-D numpy array containing the discounted rewards for each timestep in the original rewards list
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    discounted_rewards = np.zeros(len(rewards), dtype=np.float64)

    # Walk backwards so each entry reuses the already-discounted tail:
    # dr_t = r_t + gamma * dr_{t+1}. Linear time, no recursion.
    running_total = 0.0
    for t in range(len(rewards) - 1, -1, -1):
        running_total = rewards[t] + discount_factor * running_total
        discounted_rewards[t] = running_total

    return discounted_rewards

class FishPlayer(BasePokerPlayer):  # Do not forget to make parent class as "BasePokerPlayer"
    """Heads-up poker player that samples its actions from the shared ``model1``
    policy and trains that policy with REINFORCE at the end of every round.

    The player records one (state, action, reward) triple per decision. All
    rewards start at 0; the final one is filled in when the round result
    arrives, the episode is discounted and a single gradient step is applied.

    Module-level names used: ``model1``, ``discount``, ``totalactions``,
    ``totalnetgain``, ``mycards`` and ``oppcards``.
    """

    # Rank characters; the position in this list is the column of the card plane.
    CARD_RANKS = ['2','3','4','5','6','7','8','9','T','J','Q','K','A']
    # Suit characters; the position in this list is the row of the card plane.
    CARD_SUITS = ['H', 'D', 'C', 'S']

    name = ""
    startstack = 0
    endstack = 0

    count = 0

    lastbet = 0

    def __init__(self, n):
        """
        :param n: the name this player is registered under; it is matched
        against the seat names to find this player's stack
        """
        super().__init__()

        self.name = n

        # Per-instance episode buffers (previously class-level lists, which
        # are shared by every instance until they are reassigned).
        self.rewards = []
        self.states = []
        self.actions = []

    @classmethod
    def _encode_state(cls, hole_card, community_card):
        """Encode the visible cards as a (4, 13, 2) array of 0/1 flags.

        Axis 0 is the suit, axis 1 the rank; channel 0 marks the hole cards
        and channel 1 the community cards. Cards are two-character strings
        whose first character is the suit and second the rank. A card with an
        unknown suit is ignored; an unknown rank raises ValueError.

        The channel-last layout matches the ``input_shape`` the network
        declares ([state_size, state_size2, 2]) and the default layout of the
        Conv2D layers; the previous (2, 4, 13) stacking made the convolution
        treat the 13 ranks as channels.
        """
        planes = np.zeros((len(cls.CARD_SUITS), len(cls.CARD_RANKS), 2))

        for channel, cards in enumerate((hole_card, community_card)):
            for card in cards:
                face = cls.CARD_RANKS.index(card[1])
                if card[0] in cls.CARD_SUITS:
                    planes[cls.CARD_SUITS.index(card[0]), face, channel] = 1

        return planes

    #  we define the logic to make an action through this method. (so this method would be the core of your AI)
    def declare_action(self, valid_actions, hole_card, round_state):
        """Sample an action from the policy and translate it into an engine move.

        Policy outputs: 0 = fold, 1 = call, 2 = raise. ``valid_actions[1]`` is
        read as the call entry and ``valid_actions[2]`` as the raise entry
        (with ``min`` / ``max`` raise amounts).

        :param valid_actions: the moves the engine currently allows
        :param hole_card: this player's hole cards as strings such as 'D8'
        :param round_state: engine round state; 'community_card', 'street' and
        the main pot amount are read from it
        :return: (action, amount) sent to the poker engine
        """
        # TODO: position, street and last-move features were sketched in
        # earlier versions of this method but are not part of the state yet.
        state = self._encode_state(hole_card, round_state['community_card'])

        probs = model1.call(tf.expand_dims(state, 0))

        p2 = np.array(probs)[0]

        # Renormalise: np.random.choice rejects probabilities whose sum has
        # drifted away from 1 through float32 rounding.
        p2 /= p2.sum()

        print(p2)

        output = np.random.choice([0,1,2], 1, p=p2)[0]

        call_amount = valid_actions[1]['amount']
        self.lastbet = call_amount

        action = 'raise'
        amount = 0

        if output == 0:

            action = 'fold'

            # Never fold when continuing is free (call amount 0) or, preflop,
            # when the call amount is just the big blind of 2.
            if call_amount == 0 or (round_state['street'] == 'preflop' and call_amount == 2):

                action = 'call'
                output = 1
                # Send the quoted call amount like every other call path does.
                # The original left it at 0, which does not match the quoted
                # amount in the preflop case (presumably rejected by the
                # engine - verify against its action validation).
                amount = call_amount

        elif output == 1:

            action = 'call'
            amount = call_amount

        elif output == 2:

            # Raise by the minimum raise plus the current main pot.
            amount = (valid_actions[2]['amount']['min']) + (round_state['pot']['main']['amount'])

        # A max raise of -1 is treated as "raising is not possible": call instead.
        if (action == 'raise' and valid_actions[2]['amount']['max'] == -1):

            action = 'call'
            output = 1
            amount = call_amount

        if (action == "raise" and amount > valid_actions[2]['amount']['max']):
            amount = valid_actions[2]['amount']['max']

        totalactions[action] += 1

        print("ACTION: " + action)
        print(round_state['community_card'])

        # The reward stays 0 until the round result is known.
        self.states.append(state)
        self.actions.append(output)
        self.rewards.append(0)

        return action, amount   # action returned here is sent to the poker engine

    def receive_game_start_message(self, game_info):
        pass

    def receive_round_start_message(self, round_count, hole_card, seats):
        """Start a new episode: publish the hole cards, clear the buffers and
        remember this player's stack at the start of the round."""

        print(hole_card)

        global mycards
        mycards = hole_card

        self.actions = []
        self.states = []
        self.rewards = []

        # Heads-up game: only the first two seats are inspected.
        for seat in seats[:2]:
            if seat['name'] == self.name:
                self.startstack = seat['stack']

    def receive_street_start_message(self, street, round_state):
        pass

    def receive_game_update_message(self, action, round_state):
        pass

    def receive_round_result_message(self, winners, hand_info, round_state):
        """Assign the final reward of the episode and take one policy-gradient step.

        Nothing happens when the player made no decision this round, or when
        the last action was a fold and either player's hole cards are unknown.
        """

        if not self.rewards:
            return

        netgain = 0

        # Heads-up game: only the first two seats are inspected. The stack
        # change is reduced by 1 when this seat is the small blind, else by 2.
        for position, seat in enumerate(round_state['seats'][:2]):
            if seat['name'] == self.name:
                self.endstack = seat['stack']
                blind_offset = 1 if round_state['small_blind_pos'] == position else 2
                netgain = (self.endstack - self.startstack) - blind_offset

        community_cards = [Card.from_str(card) for card in round_state['community_card']]

        myscore = 0
        oppscore = 0

        if len(mycards) > 0 and len(oppcards) > 0:

            myscore = HandEvaluator.eval_hand([Card.from_str(card) for card in mycards], community_cards)
            oppscore = HandEvaluator.eval_hand([Card.from_str(card) for card in oppcards], community_cards)

        if self.actions[-1] == 0:

            if len(mycards) == 0 or len(oppcards) == 0:
                return

            # Folding when the opponent's evaluated score is higher earns a
            # small fixed bonus; otherwise the fold is scored by its net gain.
            if oppscore > myscore:
                self.rewards[-1] = 0.02
            else:
                self.rewards[-1] = (netgain/100)

        else:
            # Calls and raises are scored identically.
            self.rewards[-1] = (netgain+2)/100

        print("p1 actions map: ")
        print(totalactions)
        global totalnetgain
        totalnetgain += netgain

        print("NETGAIN")
        print(netgain)
        print("TOTALNETGAIN p1: ")
        print(totalnetgain)

        discounted_rewards = discount(self.rewards)

        print("p1 rewards: ")
        print(discounted_rewards)

        # Train on the DISCOUNTED rewards. The original computed and printed
        # them but passed the raw rewards to the loss, so only the last
        # decision of an episode ever received a learning signal.
        reward_weights = np.asarray(discounted_rewards, dtype=np.float32)

        with tf.GradientTape() as tape:
            myloss = model1.loss(self.states, self.actions, reward_weights)

        gradients = tape.gradient(myloss, model1.trainable_variables)
        model1.optimizer.apply_gradients(zip(gradients, model1.trainable_variables))

'''
class FishPlayer2(BasePokerPlayer):  # Do not forget to make parent class as "BasePokerPlayer"

    name = ""
    startstack = 0
    endstack = 0

    rewards = []
    states = []
    actions = []

    count = 0

    #  we define the logic to make an action through this method. (so this method would be the core of your AI)
    def declare_action(self, valid_actions, hole_card, round_state):
        # valid_actions format => [raise_action_info, call_action_info, fold_action_info]

        card_ref = ['2','3','4','5','6','7','8','9','T','J','Q','K','A']

        init_array = [np.zeros((4,13)) for i in range(0,5)]
        for indexx, card in enumerate(round_state['community_card']):
          face = card_ref.index(card[1])
          if card[0] == 'H':
            init_array[indexx][0][face] = 1
          elif card[0] == 'D':
            init_array[indexx][1][face] = 1
          elif card[0] == 'C':
            init_array[indexx][2][face] = 1
          elif card[0] == 'S':
            init_array[indexx][3][face] = 1

        #create flattened array
        init_array = np.array(init_array)

        flattened = init_array.flatten()

        init_array = [np.zeros((4,13)) for i in range(0,2)]
        for indexx, card in enumerate(hole_card):
          face = card_ref.index(card[1])
          if card[0] == 'H':
            init_array[indexx][0][face] = 1
          elif card[0] == 'D':
            init_array[indexx][1][face] = 1
          elif card[0] == 'C':
            init_array[indexx][2][face] = 1
          elif card[0] == 'S':
            init_array[indexx][3][face] = 1
        #create flattened array

        init_array = np.array(init_array)

        flattened = np.concatenate([init_array.flatten(), flattened])

        #allocate bigblind/pot and small stack/pot ratio vectors
        bigblindratio = np.zeros((1,10))
        smallstackratio = np.zeros((1,10))

        #find big blind stack, smallest stack
        pot = round_state['pot']['main']['amount']
        bbstack = round_state['seats'][round_state['big_blind_pos']]['stack']
        smallstack = min([round_state['seats'][i]['stack'] for i in range(0,len(round_state['seats']))])

        #add this information to the gamestate vector
        #flattened = np.concatenate([flattened, np.array([pot, smallstack])])

        #get information on most recent action:
        action_set = ['SMALLBLIND', 'BIGBLIND', 'CALL', 'FOLD', 'RAISE']

        if len(round_state['action_histories'][round_state['street']]) != 0:
          last_move = round_state['action_histories'][round_state['street']][-1]
          last_move_array = np.zeros((1,2))
          last_move_array[0][0] = action_set.index(last_move['action'])
          last_move_array[0][1] = last_move['amount']

        else:
          last_move_array = np.zeros((1,2))

        #add information on most recent action to feature vector

        pos = np.zeros(1)

        if round_state['big_blind_pos'] == 1:
          pos[0] = 1

        street = np.zeros(4)

        if round_state['street'] == 'preflop':
          street[0] = 1

        if round_state['street'] == 'flop':
          street[1] = 1

        if round_state['street'] == 'turn':
          street[2] = 1

        if round_state['street'] == 'river':
          street[3] = 1

        #add this information to the gamestate vector
        state = np.concatenate([flattened, pos, street])

        probs = model2.call(tf.expand_dims(state, 0))[0]

        p2 = np.array(probs)

        print(p2)
        #p2 = np.exp(beta * p2)/np.sum(np.exp(beta * p2))
        p2 /= p2.sum()

        print("p2 probs: ")
        print(p2)

        output = np.random.choice([0,1,2,3,4,5], 1, p=p2)

        action = 'raise'
        amount = 0

        print(self.name + " hole cards: ")
        print(hole_card)
        print(valid_actions)

        if output == 0:

          action = 'fold'

          if valid_actions[1]['amount'] == 0:

            action = 'call'

        elif output == 1:

          action = 'call'
          amount = valid_actions[1]['amount']

        elif output == 2:

          amount = (valid_actions[2]['amount']['min']) + (5 * round_state['pot']['main']['amount'])

        if (action == 'raise' and valid_actions[2]['amount']['max'] == -1):

          action = 'call'
          amount = valid_actions[1]['amount']

        if (action == "raise" and amount > valid_actions[2]['amount']['max']):
          amount = valid_actions[2]['amount']['max']

        totalactions1[action] += 1

        self.states.append(state)
        self.actions.append(output)
        self.rewards.append(0)

        return action, amount   # action returned here is sent to the poker engine

    def receive_game_start_message(self, game_info):
        pass

    def receive_round_start_message(self, round_count, hole_card, seats):

        if self.count == 0:
          self.actions = []
          self.states = []
          self.rewards = []

        if (seats[0]['name'] == self.name):
          self.startstack = seats[0]['stack']

        if (seats[1]['name'] == self.name):
          self.startstack = seats[1]['stack']

    def receive_street_start_message(self, street, round_state):
        pass

    def receive_game_update_message(self, action, round_state):
        pass

    def receive_round_result_message(self, winners, hand_info, round_state):

        if len(self.rewards) == 0:
          return

        if not self.rewards[len(self.rewards)-1] == 0:
          return

        seats = round_state['seats']

        if (seats[0]['name'] == self.name):

          if round_state['small_blind_pos'] == 0:
            self.startstack += 1
          else:
            self.startstack += 2

          self.endstack = seats[0]['stack']
          self.rewards[:] = (self.endstack - self.startstack) * learningrate

        if (seats[1]['name'] == self.name):

          if round_state['small_blind_pos'] == 1:
            self.startstack += 1
          else:
            self.startstack += 2

          self.endstack = seats[1]['stack']
          self.rewards[:] = (self.endstack - self.startstack) * learningrate


        print("p2 reward: ")
        print(self.rewards)

        print("p2 actions map: ")
        print(totalactions1)

        if self.count > 96:

          with tf.GradientTape() as tape:

            myloss = model2.loss(self.states, self.actions, self.rewards)

          gradients = tape.gradient(myloss, model2.trainable_variables)
          model2.optimizer.apply_gradients(zip(gradients, model2.trainable_variables))

          self.count = 0

        else:

          self.count += 1

    def __init__(self, n):

      self.name = n
'''



'\nclass FishPlayer2(BasePokerPlayer):  # Do not forget to make parent class as "BasePokerPlayer"\n\n    name = ""\n    startstack = 0\n    endstack = 0\n\n    rewards = []\n    states = []\n    actions = []\n\n    count = 0\n\n    #  we define the logic to make an action through this method. (so this method would be the core of your AI)\n    def declare_action(self, valid_actions, hole_card, round_state):\n        # valid_actions format => [raise_action_info, call_action_info, fold_action_info]\n\n        card_ref = [\'2\',\'3\',\'4\',\'5\',\'6\',\'7\',\'8\',\'9\',\'T\',\'J\',\'Q\',\'K\',\'A\']\n\n        init_array = [np.zeros((4,13)) for i in range(0,5)]\n        for indexx, card in enumerate(round_state[\'community_card\']):\n          face = card_ref.index(card[1])\n          if card[0] == \'H\':\n            init_array[indexx][0][face] = 1\n          elif card[0] == \'D\':\n            init_array[indexx][1][face] = 1\n          elif card[0] == \'C\':\n            init_array[index

In [None]:
# Sanity check for discount(): with the default factor of 0.95, a single
# terminal reward of 1 should appear as 0.95**k at the step k places earlier.
print(discount([0,0,0,0,1]))

[0.81450625 0.857375   0.9025     0.95       1.        ]


In [None]:
class BadPlayer(BasePokerPlayer):  # Do not forget to make parent class as "BasePokerPlayer"
    """Scripted baseline opponent: calls 90% of the time and otherwise raises
    by the minimum raise plus half of the main pot.

    Its hole cards are published through the module-level ``oppcards`` at the
    start of every round.
    """

    def declare_action(self, valid_actions, hole_card, round_state):
        """Pick between calling and raising at random.

        ``valid_actions[1]`` is read as the call entry and ``valid_actions[2]``
        as the raise entry (with ``min`` / ``max`` raise amounts).

        :return: (action, amount) sent to the poker engine
        """
        wants_raise = np.random.choice([0,1], 1, p=[0.9, 0.1])[0] != 0

        if not wants_raise:
            return "call", valid_actions[1]['amount']

        bet = (valid_actions[2]['amount']['min']) + (0.5 * round_state['pot']['main']['amount'])
        raise_cap = valid_actions[2]['amount']['max']

        # A cap of -1 is treated as "raising is not possible": call instead.
        if raise_cap == -1:
            return 'call', valid_actions[1]['amount']

        # Never exceed the largest raise the engine allows.
        if bet > raise_cap:
            bet = raise_cap

        return "raise", bet

    def receive_game_start_message(self, game_info):
        pass

    def receive_round_start_message(self, round_count, hole_card, seats):
        # Expose this round's hole cards through the shared global.
        global oppcards
        oppcards = hole_card

    def receive_street_start_message(self, street, round_state):
        pass

    def receive_game_update_message(self, action, round_state):
        pass

    def receive_round_result_message(self, winners, hand_info, round_state):
        pass

In [None]:
from pypokerengine.api.game import setup_config, start_poker

# One-round heads-up games: the learning agent "p1" against the scripted "p2".
config = setup_config(max_round=1, initial_stack=100, small_blind_amount=1)
for seat_name, bot in (("p1", FishPlayer("p1")), ("p2", BadPlayer())):
    config.register_player(name=seat_name, algorithm=bot)

# Every iteration plays a fresh game with the same configuration; the policy
# is updated from inside FishPlayer's round-result callback.
for i in range(25000):
    game_result = start_poker(config, verbose=0)

    print(f"ROUND : {i}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
-2
TOTALNETGAIN p1: 
-1701.0
p1 rewards: 
[0. 0. 0. 0.]
ROUND : 4983
['D8', 'S6']
[0.01208011 0.84997034 0.13794956]
ACTION: call
[]
[0.00370511 0.9049628  0.09133215]
ACTION: raise
['D5', 'H6', 'HQ']
[0.00256267 0.91917026 0.07826712]
ACTION: call
['D5', 'H6', 'HQ', 'C5']
[0.00114827 0.942328   0.05652376]
ACTION: call
['D5', 'H6', 'HQ', 'C5', 'D7']
p1 actions map: 
{'fold': 176, 'call': 12788, 'raise': 5124}
NETGAIN
8
TOTALNETGAIN p1: 
-1693.0
p1 rewards: 
[0.0857375 0.09025   0.095     0.1      ]
ROUND : 4984
['S3', 'D3']
[0.01959598 0.80542    0.17498401]
ACTION: raise
[]
[0.00645871 0.8689323  0.12460898]
ACTION: call
['H2', 'D6', 'HQ']
[0.00389194 0.8946749  0.10143318]
ACTION: call
['H2', 'D6', 'HQ', 'C6']
[0.00308417 0.9065189  0.09039693]
ACTION: call
['H2', 'D6', 'HQ', 'C6', 'S6']
p1 actions map: 
{'fold': 176, 'call': 12791, 'raise': 5125}
NETGAIN
7
TOTALNETGAIN p1: 
-1686.0
p1 rewards: 
[0.07716375 0.081225   

KeyboardInterrupt: ignored