### BlackJack: N-Step TD for state-value update
---
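- the policies are fixed (not learned); only the player's state values are estimated
- dealer policy: HIT17 (hit until the total reaches 17 or more, then stick)
- player policy: sticks on 20 or 21, otherwise hits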
---

In [5]:
import numpy as np
import pickle

In [6]:
class BlackJack_N_step(object):
    """Evaluate a fixed blackjack policy with n-step TD state-value updates.

    The player sticks on 20 or 21 and hits otherwise; the dealer hits until
    its total is 17 or more.  A player state is the tuple
    ``(player_value, dealer_show_card, usable_ace)`` and is recorded only
    while the player's total lies in ``12..21``.

    Attributes:
        player_state_value: dict mapping a state to its estimated value.
        player_states: states recorded during the current round.
        rewards: reward attached to each entry of ``player_states``.
        player_win / player_draw: number of rounds won / drawn so far.
        lr: step size of the TD update.
        max_T: final time-step index ``T`` of every finished round
            (one entry per round; despite the name it is not a maximum).
    """

    def __init__(self, lr=0.1):
        self.player_state_value = {}
        self.player_states = []
        self.player_win = 0
        self.player_draw = 0
        self.rewards = []
        self.lr = lr
        self.max_T = []

    # giveCard() function is borrowed from BlackJack base code
    @staticmethod
    def giveCard():
        """Draw one card uniformly from ace (1), 2-10 and three extra tens."""
        # 1 stands for ace; the three extra 10s are the face cards
        c_list = list(range(1, 11)) + [10, 10, 10]
        return np.random.choice(c_list)

    # dealerPolicy() function is borrowed from BlackJack base code
    def dealerPolicy(self, current_value, usable_ace, is_end):
        """Advance the dealer by one decision (HIT17).

        Args:
            current_value: dealer's current card sum.
            usable_ace: whether an ace is currently counted as 11.
            is_end: unused; kept so existing callers keep working.

        Returns:
            ``(value, usable_ace, is_end)`` after this decision.
        """
        if current_value > 21:
            if not usable_ace:
                # bust with nothing left to soften the hand
                return current_value, usable_ace, True
            # re-count the ace as 1 instead of 11
            current_value -= 10
            usable_ace = False
        # HIT17: stick on 17 or more
        if current_value >= 17:
            return current_value, usable_ace, True
        card = self.giveCard()
        if card == 1 and current_value <= 10:
            # an ace may be counted as 11 only if that cannot bust the hand
            return current_value + 11, True, False
        return current_value + card, usable_ace, False

    # one can only have 1 usable ace
    # playerPolicy() function is borrowed from BlackJack base code
    def playerPolicy(self, current_value, usable_ace, is_end):
        """Advance the player by one decision (stick on 20 or 21, else hit).

        Args:
            current_value: player's current card sum.
            usable_ace: whether an ace is currently counted as 11.
            is_end: unused; kept so existing callers keep working.

        Returns:
            ``((value, usable_ace, is_end), reward)``.  ``reward`` is -1 when
            the hand is found bust with no usable ace, otherwise 0.
        """
        if current_value > 21:
            if not usable_ace:
                return (current_value, usable_ace, True), -1
            # re-count the ace as 1 instead of 11
            current_value -= 10
            usable_ace = False
        # stick on 20 or 21
        if current_value >= 20:
            return (current_value, usable_ace, True), 0
        card = self.giveCard()
        if card == 1 and current_value <= 10:
            # an ace may be counted as 11 only if that cannot bust the hand
            return (current_value + 11, True, False), 0
        return (current_value + card, usable_ace, False), 0

    def reset(self):
        """Clear all per-round data before a new round is dealt."""
        # player_states must be emptied together with rewards: the n-step
        # update indexes both lists with the same time step.
        self.player_states = []
        self.rewards = []
        # kept for compatibility with the base code's attribute set
        self.player_state_action = []
        self.state = (0, 0, False)  # initial state
        self.end = False

    @staticmethod
    def _outcome(player_value, dealer_value):
        """Return the round result for the player: 1 win, 0 draw, -1 loss."""
        player_bust = player_value > 21
        dealer_bust = dealer_value > 21
        if player_bust:
            # a double bust is scored as a draw
            return 0 if dealer_bust else -1
        if dealer_bust or player_value > dealer_value:
            return 1
        if player_value < dealer_value:
            return -1
        return 0

    # modified to implement N-step TD method
    def _giveCredit(self, player_value, dealer_value, is_end, n=3, gamma=0.9):
        """Score the finished round and apply n-step TD updates to its states.

        The round outcome (+1 / 0 / -1) is credited as the reward of the last
        recorded step, then every recorded state ``S_tau`` is moved towards its
        n-step return

            G = sum_{i=tau}^{min(tau+n, T+1)-1} gamma^(i-tau) * R_i
                + gamma^n * V(S_{tau+n})        (only if tau + n <= T)

        with step size ``self.lr``.

        Args:
            player_value: player's final card sum.
            dealer_value: dealer's final card sum.
            is_end: whether the round is over; the outcome is only scored
                when this is true.
            n: number of steps to look ahead (must be >= 1).
            gamma: discount factor.

        Raises:
            ValueError: if ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be a positive integer")

        if is_end:
            outcome = self._outcome(player_value, dealer_value)
            if outcome > 0:
                self.player_win += 1
            elif outcome == 0:
                self.player_draw += 1
            # give reward only to the last step; it reaches earlier states
            # through the discounted n-step return below
            if self.rewards:
                self.rewards[-1] += outcome

        T = len(self.rewards) - 1
        self.max_T.append(T)

        values = self.player_state_value
        # Sweep tau over *every* step of the round (0..T); windows that run
        # past the end of the round are simply truncated.  Ascending order
        # means V(S_{tau+n}) is read before its own update in this round.
        for tau in range(T + 1):
            window_end = min(tau + n, T + 1)
            returns = sum(
                gamma ** (i - tau) * self.rewards[i]
                for i in range(tau, window_end)
            )
            # bootstrap from the state n steps ahead if the round got that far
            if tau + n <= T:
                returns += gamma ** n * values.get(self.player_states[tau + n], 0)

            state_to_update = self.player_states[tau]
            old_value = values.get(state_to_update, 0)
            values[state_to_update] = old_value + self.lr * (returns - old_value)

    def play(self, rounds=1000, n=3, gamma=0.9):
        """Play ``rounds`` games and update the state values after each one.

        Args:
            rounds: number of games to simulate.
            n: look-ahead of the n-step TD update.
            gamma: discount factor of the n-step TD update.
        """
        for i in range(rounds):
            self.reset()
            if i % 1000 == 0:
                print("round", i)

            # give dealer 2 cards and show 1
            # NOTE(review): an ace among these two cards is counted as 1
            show_card = self.giveCard()
            dealer_value = show_card + self.giveCard()

            # player's turn
            player_value = 0
            usable_ace, is_end = False, False
            while True:
                state, reward = self.playerPolicy(player_value, usable_ace, is_end)
                player_value, usable_ace, is_end = state

                # record states only once the total is in 12..21
                # NOTE(review): a sticking state is recorded twice, once on
                # arrival and once when the stick decision is returned
                if 12 <= player_value <= 21:
                    self.player_states.append((player_value, show_card, usable_ace))
                    self.rewards.append(reward)

                if is_end:
                    break

            # dealer's turn
            usable_ace, is_end = False, False
            while not is_end:
                dealer_value, usable_ace, is_end = self.dealerPolicy(dealer_value, usable_ace, is_end)

            # make sure every visited state has an entry before judging
            for s in self.player_states:
                self.player_state_value.setdefault(s, 0)

            self._giveCredit(player_value, dealer_value, is_end, n=n, gamma=gamma)
            

In [8]:
# Train the state-value estimates over 10000 simulated games.
b = BlackJack_N_step()
b.play(10000)
# max_T stores the final time-step index T of every game, so the quantity
# printed here is a mean over games, not a maximum.
print("the average final time step T of a game is: ", sum(b.max_T)/len(b.max_T))

round 0
round 1000
round 2000
round 3000
round 4000
round 5000
round 6000
round 7000
round 8000
round 9000
0.8659


In [23]:
# One entry is appended to max_T per finished game, so its length is the number
# of games played; using it avoids repeating the literal passed to play().
rounds_played = len(b.max_T)
print("Player wining rate", b.player_win/rounds_played)
print("Not losing rate", (b.player_win+b.player_draw)/rounds_played)

Player wining rate 0.3049
Not losing rate 0.5492
