In [1]:
import numpy as np
import pickle

In [143]:
class BlackJackSolution:
    
    def __init__(self, lr=0.1, exp_rate=0.3):
        self.player_Q_Values = {}  # key: [(player_value, show_card, usable_ace)][action] = value
        # initialise Q values | (12-21) x (1-10) x (True, False) x (1, 0) 400 in total
        for i in range(12, 22):
            for j in range(1, 11):
                for k in [True, False]:
                    self.player_Q_Values[(i, j, k)] = {}
                    for a in [1, 0]:
                        self.player_Q_Values[(i, j, k)][a] = 0
        
        self.player_state_action = []
        self.state = (0, 0, False)  # initial state
        self.actions = [1, 0]  # 1: HIT  0: STAND
        self.end = False
        self.lr = lr
        self.exp_rate = exp_rate
    
    # give card
    @staticmethod
    def giveCard():
        # 1 stands for ace
        c_list = list(range(1, 11)) + [10, 10, 10]
        return np.random.choice(c_list)
    
    def dealerPolicy(self, current_value, usable_ace, is_end):
        if current_value > 21:
            if usable_ace:
                current_value -= 10
                usable_ace = False
            else:
                return current_value, usable_ace, True
        # HIT17
        if current_value >= 17:
            return current_value, usable_ace, True
        else:
            card = self.giveCard()
            if card == 1:
                if current_value <= 10:
                    return current_value+11, True, False
                return current_value+1, usable_ace, False
            else:
                return current_value+card, usable_ace, False
            
    def chooseAction(self):
        # if current value <= 11, always hit
        current_value = self.state[0]
        if current_value <= 11:
            return 1
        
        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            v = -999
            action = 0
            for a in self.player_Q_Values[self.state]:
                if self.player_Q_Values[self.state][a] > v:
                    action = a
                    v = self.player_Q_Values[self.state][a]
        return action
            
    # one can only has 1 usable ace 
    # return next state
    def playerNxtState(self, action):
        current_value = self.state[0]
        show_card = self.state[1]
        usable_ace = self.state[2]
        
        if action:
            # action hit
            card = self.giveCard()
            if card == 1:
                if current_value <= 10:
                    current_value += 11
                    usable_ace = True
                else:
                    current_value += 1
            else:
                current_value += card
        else:
            # action stand
            self.end = True
            return (current_value, show_card, usable_ace)
        
        if current_value > 21:
            if usable_ace:
                current_value -= 10
                usable_ace = False
            else:
                self.end = True
                return (current_value, show_card, usable_ace)
        
        return (current_value, show_card, usable_ace)
        
    def winner(self, player_value, dealer_value):
        # player 1 | draw 0 | dealer -1
        winner = 0
        if player_value > 21:
            if dealer_value > 21:
                # draw
                winner = 0
            else:
                winner = -1
        else:
            if dealer_value > 21:
                winner = 1
            else:
                if player_value < dealer_value:
                    winner = -1
                elif player_value > dealer_value:
                    winner = 1
                else:
                    # draw
                    winner = 0
        return winner
    
    def _giveCredit(self, player_value, dealer_value):
        reward = self.winner(player_value, dealer_value)
        # backpropagate reward
        for s in reversed(self.player_state_action):
            state, action = s[0], s[1]
            reward = self.lr*(reward - self.player_Q_Values[state][action])
            self.player_Q_Values[state][action] += reward
            
    def reset(self):
        self.player_state_action = []
        self.state = (0, 0, False)  # initial state
        self.end = False
        
    def deal2cards(self, show=False):
        # return value after 2 cards and usable ace
        value, usable_ace = 0, False
        cards = [self.giveCard(), self.giveCard()]
        if 1 in cards:
            value = sum(cards)+10
            usable_ace = True
        else:
            value = sum(cards)
            usable_ace = False
        
        if show:
            return value, usable_ace, cards[0]
        else:
            return value, usable_ace

    def play(self, rounds=1000):
        for i in range(rounds):
            if i % 1000 == 0:
                print("round", i)
                
            # give 2 cards
            dealer_value, d_usable_ace, show_card = self.deal2cards(show=True)
            player_value, p_usable_ace = self.deal2cards(show=False)
            
            self.state = (player_value, show_card, p_usable_ace)
            
            # judge winner after 2 cards
            if player_value == 21 or dealer_value == 21:
                # game end
                # print("reach 21 in 2 cards: player value {} | dealer value {}".format(player_value, dealer_value))
                next
            else:
                while True:
                    action = self.chooseAction()  # state -> action 
                    # print("current value {}, action {}".format(self.state[0], action))
                    if self.state[0] >= 12:
                        state_action_pair = [self.state, action]
                        # print(state_action_pair)
                        self.player_state_action.append(state_action_pair)
                    # update next state
                    self.state = self.playerNxtState(action)
#                     print(self.state)
                    if self.end:
                        break    

                # dealer's turn
                is_end = False
                while not is_end:
                    dealer_value, d_usable_ace, is_end = self.dealerPolicy(dealer_value, d_usable_ace, is_end)

                # judge winner
                # give reward and update Q value
                player_value = self.state[0]
                # print("player value {} | dealer value {}".format(player_value, dealer_value))
                self._giveCredit(player_value, dealer_value)
            # print("player state action", self.player_state_action)
            self.reset()
            
    def savePolicy(self, file="policy"):
        fw = open(file, 'wb')
        pickle.dump(self.player_Q_Values, fw)
        fw.close()

    def loadPolicy(self, file="policy"):
        fr = open(file,'rb')
        self.player_Q_Values = pickle.load(fr)
        fr.close()
        
    # trained robot play against dealer
    def playWithDealer(self, rounds=1000):
        self.reset()
        self.loadPolicy()
        self.exp_rate = 0
        
        result = np.zeros(3)  # player [win, draw, lose]
        for _ in range(rounds):
            # hit 2 cards each
            # give 2 cards
            dealer_value, d_usable_ace, show_card = self.deal2cards(show=True)
            player_value, p_usable_ace = self.deal2cards(show=False)
            
            self.state = (player_value, show_card, p_usable_ace)
            
            # judge winner after 2 cards
            if player_value == 21 or dealer_value == 21:
                if player_value == dealer_value:
                    result[1] += 1
                elif player_value > dealer_value:
                    result[0] += 1
                else:
                    result[2] += 1
            else:
                # player's turn
                while True:
                    action = self.chooseAction()
                    # update next state
                    self.state = self.playerNxtState(action)
                    if self.end:
                        break    

                # dealer's turn
                is_end = False
                while not is_end:
                    dealer_value, d_usable_ace, is_end = self.dealerPolicy(dealer_value, d_usable_ace, is_end)

                # judge
                player_value = self.state[0]
                # print("player value {} | dealer value {}".format(player_value, dealer_value))
                w = self.winner(player_value, dealer_value)
                if w == 1:
                    result[0] += 1
                elif w == 0:
                    result[1] += 1
                else:
                    result[2] += 1
            self.reset()
        return result

In [14]:
# winner test
for _ in range(10):
    p_value = np.random.choice(range(12, 30))
    d_value = np.random.choice(range(12, 30))
    r = b.winner(p_value, d_value)
    print(p_value, d_value, r)

18 16 1
22 29 0
16 25 1
13 26 1
21 18 1
14 23 1
13 18 -1
29 29 0
17 18 -1
28 27 0


In [26]:
# test dealer policy
for _ in range(10):
    print("------------------")
    dealer_value, d_usable_ace, is_end = 0, False, False
    while not is_end:
        dealer_value, d_usable_ace, is_end = b.dealerPolicy(dealer_value, d_usable_ace, is_end)
        print(dealer_value, d_usable_ace)

------------------
4 False
14 False
15 False
16 False
20 False
20 False
------------------
3 False
10 False
19 False
19 False
------------------
9 False
11 False
17 False
17 False
------------------
7 False
16 False
18 False
18 False
------------------
5 False
15 False
25 False
25 False
------------------
11 True
17 True
17 True
------------------
7 False
11 False
14 False
24 False
24 False
------------------
10 False
16 False
24 False
24 False
------------------
7 False
13 False
19 False
19 False
------------------
6 False
9 False
16 False
26 False
26 False


In [71]:
# test deal2cards
i, j = 0, 0
for _ in range(1000):
    p, _ = b.deal2cards()
    d, _ = b.deal2cards()
    if p == 21:
        i += 1
    if d == 21:
        j += 1
print(i, j)

58 48


In [106]:
# test next state
b = BlackJackSolution()
b.state = (19, 10, True)
print(b.playerNxtState(action=1))
print(b.end)

b = BlackJackSolution()
b.state = (11, 10, True)
print(b.playerNxtState(action=1))
print(b.end)

get card 5
(14, 10, False)
False
get card 10
(21, 10, True)
False


In [110]:
# test play
b = BlackJackSolution()
b.play(10)

round 0
[(17, 8, False), 1]
(18, 8, False)
[(18, 8, False), 1]
(21, 8, False)
[(21, 8, False), 1]
(27, 8, False)
[(19, 8, False), 1]
(22, 8, False)
[(17, 10, False), 1]
(22, 10, False)
(13, 10, False)
[(13, 10, False), 1]
(17, 10, False)
[(17, 10, False), 0]
(17, 10, False)
[(16, 10, True), 1]
(14, 10, False)
[(14, 10, False), 1]
(17, 10, False)
[(17, 10, False), 0]
(17, 10, False)
[(16, 10, False), 1]
(26, 10, False)
[(15, 7, False), 1]
(25, 7, False)
[(15, 10, False), 1]
(16, 10, False)
[(16, 10, False), 0]
(16, 10, False)
[(16, 8, False), 1]
(26, 8, False)
[(15, 4, False), 1]
(25, 4, False)


### Play

In [169]:
b = BlackJackSolution(exp_rate=0.1, lr=0.2)
b.play(50000)
b.savePolicy()

round 0
round 1000
round 2000
round 3000
round 4000
round 5000
round 6000
round 7000
round 8000
round 9000
round 10000
round 11000
round 12000
round 13000
round 14000
round 15000
round 16000
round 17000
round 18000
round 19000
round 20000
round 21000
round 22000
round 23000
round 24000
round 25000
round 26000
round 27000
round 28000
round 29000
round 30000
round 31000
round 32000
round 33000
round 34000
round 35000
round 36000
round 37000
round 38000
round 39000
round 40000
round 41000
round 42000
round 43000
round 44000
round 45000
round 46000
round 47000
round 48000
round 49000


In [170]:
b.playWithDealer(10000)

array([4152., 1694., 4154.])

In [171]:
for k, v in b.player_Q_Values.items():
    actions = b.player_Q_Values.get(k)
    action = max(actions.keys(), key=lambda k: actions[k])
    action = "HIT" if action == 1 else "STAND"
    print(k, action)

(12, 1, True) HIT
(12, 1, False) HIT
(12, 2, True) HIT
(12, 2, False) HIT
(12, 3, True) HIT
(12, 3, False) STAND
(12, 4, True) HIT
(12, 4, False) HIT
(12, 5, True) HIT
(12, 5, False) HIT
(12, 6, True) HIT
(12, 6, False) HIT
(12, 7, True) HIT
(12, 7, False) HIT
(12, 8, True) HIT
(12, 8, False) HIT
(12, 9, True) HIT
(12, 9, False) HIT
(12, 10, True) HIT
(12, 10, False) HIT
(13, 1, True) HIT
(13, 1, False) HIT
(13, 2, True) HIT
(13, 2, False) HIT
(13, 3, True) HIT
(13, 3, False) HIT
(13, 4, True) STAND
(13, 4, False) HIT
(13, 5, True) HIT
(13, 5, False) STAND
(13, 6, True) HIT
(13, 6, False) HIT
(13, 7, True) HIT
(13, 7, False) HIT
(13, 8, True) HIT
(13, 8, False) HIT
(13, 9, True) HIT
(13, 9, False) HIT
(13, 10, True) HIT
(13, 10, False) HIT
(14, 1, True) HIT
(14, 1, False) HIT
(14, 2, True) HIT
(14, 2, False) HIT
(14, 3, True) HIT
(14, 3, False) HIT
(14, 4, True) HIT
(14, 4, False) HIT
(14, 5, True) HIT
(14, 5, False) HIT
(14, 6, True) HIT
(14, 6, False) STAND
(14, 7, True) HIT
(14, 7, 

In [156]:
(14, 5, False) (15, 3, False) (15, 4, False) (16, 3, False)

{1: -0.003076008940818286, 0: 0}

In [None]:
(17, 9, False) (19, 2, True)

In [173]:
b.player_Q_Values.get((19, 5, True)) 

{1: 0.006059485993377533, 0: -0.03797024795655696}