In Part 4, we went further in training the model to play with two actions. We have an optimal policy saved from the Basic Strategy for 1 deck found on Wizard of Odds, adjusted to just HIT and STAND actions -- without splitting or doubling down.

We tried to measure the performance of the training algorithm by itself, using cumulative reward, discounted reward-to-go, the RMS of the policy (to see if it changed much over time), etc. Nothing was really definitive, although seeing (what I think is) the discounted reward-to-go flatten out over time was encouraging; plus, it flattened out on the negative side, which shows that the generated policy is definitely not a winner. We showed that in Part 3 anyway.

The best measurement of success is probably going to be playing with the calculated model.

Let's refactor what we have and expand it.
* Create a new method for getAction in the Player class. This corresponds to the strategy a player will follow after training is over. By default, it's a random action.
* Create two subclasses for Player: QPolicyPlayer, and OPolicyPlayer, for players who will follow the Q(s,a) policy and the basic strategy respectively.
* Move metrics into the Player class
* Add multi-players to the training loop. Goal: 3
    * Put last state, action into Player class
* Design runs strategy to average results

Nice to have
* Create a class for policies? Right now, they're global variables.


In [None]:
%matplotlib inline

In [1]:
# %load utilities.py
import random
import logging

def hit(p,s):
    """Give player `p` one card drawn from shoe `s` and log the new hand.

    The hand and its point total are written to the debug log after the
    card has been received.
    """
    card = s.draw()
    p.receive(card)
    message = "New hand: {} ({})".format(p.hand, p.getPoints())
    logging.debug(message)

def newHand(P,s):
    """Start a fresh hand: reset every participant in `P`, then deal from `s`.

    All participants are reset before any card is dealt.
    """
    for participant in P:
        participant.reset()
    deal(P, s)

# Deal
def deal(P,s):
    """Deal two cards to each participant in `P`, one card per round.

    Cards are drawn from `s` in table order: everybody gets a first card
    before anybody gets a second one.
    """
    for _round in range(2):
        for participant in P:
            card = s.draw()
            participant.receive(card)
    
# Updated getAction() using exploration/exploitation
def getTrainAction(Q,N,s):
    """Pick a training action for state `s` with a decaying exploration rate.

    The exploration probability is epsilon / (epsilon + N[s]), so it shrinks
    as the visit count N[s] grows. When exploring, a random entry of ACTIONS
    is returned; otherwise the action with the larger Q-value is returned,
    with 'STAND' winning ties.
    """
    visits = N[s]
    explore_prob = epsilon/(epsilon + visits)
    roll = random.random()

    if roll >= explore_prob:
        # Use what we've learned
        logging.debug("Exploiting (N[s] = {}, e = {}, rr = {}): Optimal action selected".format(visits,explore_prob,roll))
        hit_value = Q[s+('HIT',)]
        stand_value = Q[s+('STAND',)]
        return 'HIT' if hit_value > stand_value else 'STAND'

    logging.debug("Exploring (N[s] = {}, e = {}, rr = {}): Random action selected".format(visits,explore_prob,roll))
    return ACTIONS[random.randint(0,1)]

# Updated Q(s,a) value
def getUpdatedQsa(Q,sa,r,s1,A,alpha=0.08,gamma=None):
    """Return the one-step Q-learning update for the state-action pair `sa`.

    Computes Q[sa] + alpha * (r + gamma * max_a Q[s1 + (a,)] - Q[sa]).
    The table itself is not assigned to; the caller stores the result.

    Parameters:
        Q     -- mapping from (state..., action) tuples to values
        sa    -- the state-action key being updated
        r     -- reward observed for the transition
        s1    -- successor state tuple
        A     -- sequence of available actions (any length, list or tuple)
        alpha -- learning rate; defaults to the previously hard-coded 0.08
        gamma -- discount factor; defaults to the module-level `discount`
    """
    if gamma is None:
        gamma = discount

    logging.debug("Updating Q{} {} ...".format(sa,Q[sa]))
    # Build each successor key explicitly instead of slicing A, so A may be
    # a list or a tuple and may hold any number of actions.
    best_next = max(Q[s1+(a,)] for a in A)
    q = Q[sa] + alpha*(r + gamma*best_next - Q[sa])
    logging.debug("Updated Q{} {} ...".format(sa,q))
    return q

def getReward(p,d):
    """Score player `p` against dealer `d` once the hand is over.

    Returns -1 for a loss, 0 for a push, 1 for a win and 1.5 for a player
    blackjack the dealer does not match. A busted player loses regardless
    of the dealer's hand.
    """
    player_total = p.getPoints()
    if player_total > 21:
        return -1

    # Blackjacks are settled before totals are compared
    dealer_natural = d.hasBlackjack()
    player_natural = p.hasBlackjack()
    if dealer_natural:
        return 0 if player_natural else -1
    if player_natural:
        return 1.5

    dealer_total = d.getPoints()
    if dealer_total > 21 or player_total > dealer_total:
        return 1
    return 0 if player_total == dealer_total else -1

Overwriting utilities.py


In [2]:
# %load player.py
# Player class
import random

ACTIONS = ['HIT','STAND']

class Player:
    """A participant at the table (also used for the dealer).

    Holds the current hand, the last state/action seen in the current hand,
    and per-player metrics (bankroll history, win/loss/push counts and
    cumulative reward).
    """

    def __init__(self,name,bankRoll=0):
        """Create a player called `name` with a starting `bankRoll`."""
        self.name = name
        self.bankRoll = bankRoll
        self.wallet = []
        self.wins = 0
        self.losses = 0
        self.pushes = 0
        self.cumReward = [0]
        self.reset()

    def reset(self):
        """Clear the hand and the remembered state/action for a new hand."""
        self.hand = []
        self.lastState = ()
        self.lastAction = ''

    def receive(self,card):
        """Add `card` to the hand."""
        self.hand.append(card)

    def getPoints(self):
        """Return the hand total, counting aces (11) as 1 only when needed.

        The hand itself is left untouched: demoting an ace in place would
        change hand[0], which callers read as the dealer's upcard when
        building states.
        """
        points = sum(self.hand)
        soft_aces = self.hand.count(11)
        # Each ace counted as 1 instead of 11 lowers the total by 10.
        while points > 21 and soft_aces > 0:
            points -= 10
            soft_aces -= 1
        return points

    def hasBlackjack(self):
        """Return True when the hand is exactly two cards totalling 21."""
        return (self.getPoints() == 21 and len(self.hand) == 2)

    # Default action for a player is random
    def getAction(self,_):
        """Return a uniformly random entry of ACTIONS; the state is ignored."""
        return random.choice(ACTIONS)

class QPolicyPlayer(Player):
    """Player that acts greedily on the learned Q(s,a) table `Q`."""

    def getAction(self,s):
        """Return the action with the highest Q-value recorded for state `s`.

        Only entries already present in Q are considered, so nothing is
        inserted into the table. If the state has no entries at all (it was
        never reached during training) the base-class default action is
        used instead of raising ValueError from max() on an empty sequence.
        """
        candidates = [sa for sa in Q if sa[0:2] == s]
        if not candidates:
            return super().getAction(s)
        best = max(candidates, key=lambda sa: Q[sa])
        return best[-1]
    
class OPolicyPlayer(Player):
    """Player that follows the fixed strategy table `BASIC`."""

    def getAction(self,s):
        """Look up state `s` in BASIC and return the action stored there."""
        prescribed = BASIC[s]
        return prescribed
    

Overwriting player.py


In [5]:
# %load load_strategy.py
import csv

# BASIC maps (first column, second column) as ints -> third column as text.
# presumably (player total, dealer upcard) -> action; verify against the CSV.
BASIC = {}

# The context manager closes the file even if a row fails to parse;
# newline='' is what the csv module expects for files it reads.
with open('BasicStrategy_1.csv','r',newline='') as f:
    reader = csv.reader(f)
    for line in reader:
        if not line:
            # csv.reader yields [] for blank lines; skip them
            continue
        BASIC[(int(line[0]),int(line[1]))] = line[2]

print("Basic strategy loaded into BASIC")

Overwriting load_strategy.py


In [3]:
# %load main.py
#from player import Player
from shoe import Shoe
#from utilities import hit, newHand, deal, getReward, getAction, getUpdatedQsa
from collections import defaultdict
from IPython.display import clear_output

#import logging
#logger = logging.getLogger()
#fhandler = logging.FileHandler(filename='Training.log', mode='a')
#formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#fhandler.setFormatter(formatter)
#logger.addHandler(fhandler)
#logger.setLevel(logging.DEBUG)

# Initialize
# Game elements: the shoe, the dealer and the two players that will be trained
# (Shoe(1) is presumably a single deck -- confirm against shoe.py)
shoe = Shoe(1)
dealer = Player("Dealer")
players = [Player("Player 1"), Player("Player 2")]

# Actions as a tuple so they can be appended to state tuples
ACTIONS = ('HIT', 'STAND')

# Learning functions: Q holds the Q(s,a) values, N the per-state visit counts;
# both default missing entries to 0.0
Q, N = defaultdict(float), defaultdict(float)

# Learning variables
epsilon = 10      # exploration constant used by getTrainAction
lr = 0.08         # learning rate
discount = 0.99   # discount factor used by getUpdatedQsa

#logger.removeHandler(fhandler)

#logger.removeHandler(fhandler)

Overwriting main.py


In [4]:
# %load training.py
import matplotlib.pyplot as plt
import numpy as np
import random
import logging

logger = logging.getLogger()
fhandler = logging.FileHandler(filename='Training.log', mode='w')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.INFO)

# Initialize first! This cell uses shoe, dealer, players, Q, N, ACTIONS and lr
# from main.py, and the helper functions from utilities.py.
# Train with a number of hands
wins = 0
losses = 0
pushes = 0
R=[0]
n = 100000

# print("Starting bankroll: {}".format(bankroll))
# Training
try:
    for i in range(n):
        # New episode
        logging.debug("==== Episode {} ====".format(i))
        newHand([dealer]+players,shoe)
        # Read the upcard once so every state in this hand uses the same
        # dealer card.
        upcard = dealer.hand[0]
        logging.debug("Dealer's hand: {} ({})".format(dealer.hand,dealer.getPoints()))

        for p in players:
            logging.debug("{}'s hand: {} ({})".format(p.name,p.hand,p.getPoints()))
            p.lastState = (p.getPoints(),upcard,)
            done = False
            while not done:
                s = p.lastState

                # Choose an action
                logging.debug("Current state: {}".format(s))
                a = getTrainAction(Q,N,s)
                logging.debug("{} {}: ".format(p.name,a))
                # Remember the action on the player: the final update below
                # runs after every player has acted, so the loop variable
                # `a` would only hold the last player's last action.
                p.lastAction = a

                # Take the action
                if a == 'HIT':
                    hit(p,shoe)
                    done = p.getPoints() > 21
                else:
                    done = True

                # Update the intermediary Q(s,a): no reward until the hand ends
                if not done:
                    r = 0
                    s1 = (p.getPoints(),upcard,)

                    # Update Q(s,a)
                    Q[s+(a,)] = getUpdatedQsa(Q,s+(a,),r,s1,ACTIONS)
                    N[s] += 1
                    p.lastState = s1

                # Otherwise this player has reached her terminal state;
                # lastState/lastAction now hold her final decision.

        # All players stand or busted; play out the dealer
        while dealer.getPoints() < 17:
            logging.debug("Dealer HIT: ")
            hit(dealer,shoe)
        logging.debug("Dealer STAND: ")

        # Update final Q(s,a)
        for p in players:
            r = getReward(p,dealer)
            s = p.lastState
            sa = s+(p.lastAction,)
            # Terminal transition: there is no successor state, so the
            # target is the reward alone (no discounted bootstrap term).
            logging.debug("Updating Q{} {} ...".format(sa,Q[sa]))
            Q[sa] += lr*(r - Q[sa])
            logging.debug("Updated Q{} {} ...".format(sa,Q[sa]))
            N[s] += 1

            # Metrics
            if r < 0:
                logging.debug("Lose ({})".format(r))
                losses += 1
            elif r > 0:
                logging.debug("Win ({})".format(r))
                wins += 1
            else:
                logging.debug("Push ({})".format(r))
                pushes += 1
finally:
    # Detach and close the handler even if training is interrupted, so the
    # log file handle is released and re-running the cell does not stack
    # handlers on the root logger.
    logger.removeHandler(fhandler)
    fhandler.close()

print("\n\tWins: {} %".format(100*(wins/(wins+losses+pushes))))


Overwriting training.py


In [6]:
# %load playing.py
import matplotlib.pyplot as plt
import numpy as np
import random
from IPython.core.debugger import Pdb
pdb = Pdb()
#pdb.set_trace()

import logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='Playing.log', mode='w')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.INFO)

# Initialize first! This cell uses shoe from main.py, the trained Q table,
# BASIC from load_strategy.py and the helper functions from utilities.py.
# Play n hands
bankroll = 1000.00
wins = 0
losses = 0
pushes = 0
WALLET=[]
R=[0]
n = 1000
# Flat wager per hand; getReward() returns the outcome in units of one bet
bet = 5
dealer = Player("Dealer")
players=[QPolicyPlayer("Q Player",1000.0)]
players.append(OPolicyPlayer("Basic Player",1000.0))
players.append(Player("Random Player",1000.0))

#player1 = QPolicyPlayer("Q-Player",1000.0)
#player2 = OPolicyPlayer("BasicPlayer",1000.0)

# Play hands for all players simultaneously

try:
    for i in range(n):
        # Initialize s
        newHand([dealer]+players,shoe)
        # Read the upcard once so every state in this hand uses the same
        # dealer card.
        upcard = dealer.hand[0]
        logging.debug("Dealer's hand: {} ({})".format(dealer.hand,dealer.getPoints()))
        # All players play in turn. Rewards are calculated at the end of the hand
        for p in players:
            logging.debug("{}'s hand: {} ({})".format(p.name,p.hand,p.getPoints()))

            while True:
                s = (p.getPoints(),upcard,)
                a = p.getAction(s)
                logging.debug("{} {}: ".format(p.name,a))

                # Anything other than HIT ends this player's turn
                if a != 'HIT':
                    break

                hit(p,shoe)
                if p.getPoints() > 21:
                    break

        # Players done; play out the dealer
        while dealer.getPoints() < 17:
            logging.debug("Dealer HIT: ")
            hit(dealer,shoe)

        for p in players:
            r = getReward(p,dealer)*bet

            # Calculate and keep track of wins/losses
            if r < 0:
                logging.debug("Lose ({})".format(r))
                p.losses += 1
            elif r > 0:
                logging.debug("Win ({})".format(r))
                p.wins += 1
            else:
                logging.debug("Push ({})".format(r))
                p.pushes += 1

            # Data from the episode
            p.bankRoll += r
            p.cumReward.append(p.cumReward[-1]+r)
            p.wallet.append(p.bankRoll)
finally:
    # Detach and close the handler even if a hand raises, so the log file
    # handle is released and re-running the cell does not stack handlers.
    logger.removeHandler(fhandler)
    fhandler.close()

print("Results over {} hands:".format(n))
fig, ax = plt.subplots()
for p in players:
    print("\n\t{} Wins: {} %\n\tOutcome: {}".format(p.name,100*(p.wins/(p.wins+p.losses+p.pushes)),p.bankRoll))
    ax.plot(p.cumReward,label=p.name)

ax.legend(loc='best')
plt.show()

#fig = plt.figure(1,figsize=(12,6))

#plt.subplot(121)
#plt.plot(CR)
#plt.title('Cumulative reward for {} hands'.format(n))

#plt.subplot(122)
#plt.plot(WALLET)
#plt.title('Bankroll over {} hands assuming $5 bet'.format(n))

#fig.tight_layout()

Overwriting playing.py
