Time to bring it all together.

The experiment combines training and testing, so we'll need to get all of this into one loop. Without going too crazy, let's factor the pieces into methods.

I don't want to bother with passing Q, N, and other common data structures back and forth through argument lists, so I'm going to use global scope and not worry about it.

Here's the algorithm I'm shooting for:

```
Set learning rate, discount, and epsilon for the experiment
For each run
    Initialize environment
    while training runs < some number
        train the agent
        reset players
        test the agent
        capture x = N, p1 reward, p2 reward, p3 reward
        increase N
```


In [1]:
%matplotlib inline

In [2]:
# %load global_identifiers.py
# Global identifiers
from collections import defaultdict

# The two moves available to a player on each turn.
ACTIONS=('HIT','STAND',)

# Learning functions
# Q maps a state tuple extended with an action name to its learned value;
# entries that have never been written read as 0.0.
Q = defaultdict(float)
# N counts the updates applied from each state; it drives the decay of the
# exploration probability during training.
N = defaultdict(float)

# Learning variables
# Exploration constant: training explores with probability epsilon/(epsilon + N[s]).
epsilon = 10
# Learning rate for the experiment.
lr = 0.08
# Discount factor applied to the best next-state Q value.
discount = 0.99



In [3]:
# %load load_strategy.py
import csv

# Basic-strategy lookup table filled from the CSV: the first two columns of
# each row are parsed as integers and form the key; the third column is
# stored unchanged as the value.
BASIC = {}
# The with-block closes the file even when a row fails to parse, and
# newline='' is what the csv module asks for when handed a file object.
with open('BasicStrategy_1.csv','r',newline='') as f:
    reader = csv.reader(f)
    for line in reader:
        BASIC[(int(line[0]),int(line[1]))] = line[2]

print("Basic strategy loaded into BASIC")

Basic strategy loaded into BASIC


In [4]:
# %load training.py
import logging
import random
from utilities import *
#from global_identifiers import *

#
# Global references:
# N - the per-state visit counter used by the epsilon-greedy exploration function
# Q - the Q(s,a) function
# epsilon, lr, discount

# Get an action using exploration/exploitation based on the current state
def getTrainAction(s):
    """Pick the training action for state ``s``.

    With probability epsilon/(epsilon + N[s]) a random entry of ACTIONS is
    returned; otherwise the action with the larger Q value is returned,
    with ties going to 'STAND'. Reads the module-level Q, N, epsilon and
    ACTIONS.
    """
    visits = N[s]
    explore_prob = epsilon/(epsilon + visits)
    draw = random.random()

    if draw >= explore_prob:
        # Use what we've learned so far
        logging.debug("Exploiting (N[s] = {}, e = {}, rr = {}): Optimal action selected".format(visits,explore_prob,draw))
        hit_value = Q[s+('HIT',)]
        stand_value = Q[s+('STAND',)]
        return 'HIT' if hit_value > stand_value else 'STAND'

    # The state is still under-sampled: try something at random
    logging.debug("Exploring (N[s] = {}, e = {}, rr = {}): Random action selected".format(visits,explore_prob,draw))
    return ACTIONS[random.randint(0,1)]

# Updated Q(s,a) value
def getUpdatedQsa(sa,r,s1):
    """Return the one-step Q-learning update for the state-action key ``sa``.

    Computes Q[sa] + lr*(r + discount*max_a Q[s1+(a,)] - Q[sa]) from the
    module-level Q, lr, discount and ACTIONS. The new value is returned, not
    stored; the caller assigns it back into Q.

    sa -- state tuple with the action name appended (a key of Q)
    r  -- reward observed for taking that action
    s1 -- state tuple reached afterwards
    """
    logging.debug("Updating Q{} {} ...".format(sa,Q[sa]))
    # Best value reachable from the next state, over every known action.
    best_next = max(Q[s1+(a,)] for a in ACTIONS)
    # Use the shared learning rate instead of a literal so that changing lr
    # for an experiment actually changes the update.
    q = Q[sa] + lr*(r + discount*best_next - Q[sa])
    logging.debug("Updated Q{} {} ...".format(sa,q))
    return q


def trainQLAgent(dealer,players,shoe,n):
    """Train the shared Q table by playing ``n`` hands with ``players``.

    Every player acts through getTrainAction and updates the module-level
    Q and N tables: a zero-reward update after each HIT that does not end
    the player's turn, and one final update with the reward from getReward
    once the dealer has played. Prints the percentage of player-hands won.

    dealer  -- player object used as the dealer (hits while below 17)
    players -- list of player objects; lastState/lastAction are set on each
    shoe    -- card source handed to newHand and hit
    n       -- number of hands (episodes) to play

    NOTE(review): the final update still bootstraps from
    (points, dealer card) after play has ended; after a STAND that is the
    same tuple as the state being updated. Confirm this is intended.
    """
    wins = 0
    losses = 0
    pushes = 0

    # Training
    for i in range(n):
        # New episode
        logging.debug("==== Episode {} ====".format(i))
        newHand([dealer]+players,shoe)
        logging.debug("Dealer's hand: {} ({})".format(dealer.hand,dealer.getPoints()))

        for p in players:
            logging.debug("{}'s hand: {} ({})".format(p.name,p.hand,p.getPoints()))
            p.lastState = (p.getPoints(),dealer.hand[0],)
            done = False
            while not done:
                s = p.lastState

                # choose an action
                logging.debug("Current state: {}".format(s))
                a = getTrainAction(s)
                # Remember the choice on the player itself: the final update
                # runs after every player has acted, when the local `a`
                # belongs to whoever played last.
                p.lastAction = a
                logging.debug("{} {}: ".format(p.name,a))

                # Take the action
                if a == 'HIT':
                    hit(p,shoe)
                    if p.getPoints() > 21:
                        done = True
                else:
                    done = True

                # Update the intermediary Q(s,a)
                if not done:
                    r = 0
                    s1 = (p.getPoints(),dealer.hand[0],)

                    # Update Q(s,a)
                    Q[s+(a,)] = getUpdatedQsa(s+(a,),r,s1)
                    N[s] += 1
                    p.lastState = s1

                # Otherwise this player has reached her terminal state

        # All players stand or busted; play out the dealer
        while dealer.getPoints() < 17:
            logging.debug("Dealer HIT: ")
            hit(dealer,shoe)
        logging.debug("Dealer STAND: ")

        # Update final Q(s,a)
        for p in players:
            r = getReward(p,dealer)
            s = p.lastState
            # Credit the reward to the action this player actually took last.
            a = p.lastAction
            s1 = (p.getPoints(),dealer.hand[0],)
            Q[s+(a,)] = getUpdatedQsa(s+(a,),r,s1)
            N[s] += 1

            # Metrics
            if r < 0:
                logging.debug("Lose ({})".format(r))
                losses += 1
            elif r > 0:
                logging.debug("Win ({})".format(r))
                wins += 1
            else:
                logging.debug("Push ({})".format(r))
                pushes += 1

    # No hands played (n == 0 or no players) would otherwise divide by zero.
    total = wins+losses+pushes
    winPct = 100*(wins/total) if total else 0.0
    print("\n\tWins: {} %".format(winPct))


In [5]:
# %load playing.py
import matplotlib.pyplot as plt
import numpy as np
import random
#from IPython.core.debugger import Pdb
#pdb = Pdb()
#pdb.set_trace()

#import logging
#logger = logging.getLogger()
#fhandler = logging.FileHandler(filename='Playing.log', mode='w')
#formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#fhandler.setFormatter(formatter)
#logger.addHandler(fhandler)
#logger.setLevel(logging.INFO)

def testQLAgent(dealer,players,shoe,n,bet=5):
    """Play ``n`` hands with each player's own policy, then report and plot.

    All players act in turn through their getAction method, the dealer
    hits while below 17, and each player's reward from getReward is scaled
    by ``bet``. Per-player wins/losses/pushes, bankRoll, cumReward and
    wallet are updated in place. Afterwards a summary is printed and every
    player's cumulative reward is plotted on one axis.

    dealer  -- player object used as the dealer
    players -- list of player objects; cumReward must be non-empty
    shoe    -- card source handed to newHand and hit
    n       -- number of hands to play
    bet     -- stake multiplied into each hand's reward (default 5)
    """
    for i in range(n):
        # Initialize s
        newHand([dealer]+players,shoe)
        logging.debug("Dealer's hand: {} ({})".format(dealer.hand,dealer.getPoints()))
        # All players play in turn. Rewards are calculated at the end of the hand
        for p in players:
            logging.debug("{}'s hand: {} ({})".format(p.name,p.hand,p.getPoints()))

            while True:
                s = (p.getPoints(),dealer.hand[0],)
                a = p.getAction(s)
                logging.debug("{} {}: ".format(p.name,a))

                if a == 'HIT':
                    hit(p,shoe)
                else:
                    break

                if p.getPoints() > 21:
                    break

        # Players done; play out the dealer
        while dealer.getPoints() < 17:
            logging.debug("Dealer HIT: ")
            hit(dealer,shoe)

        for p in players:
            r = getReward(p,dealer)*bet

            # Calculate and keep track of wins/losses
            if r < 0:
                logging.debug("Lose ({})".format(r))
                p.losses += 1
            elif r > 0:
                logging.debug("Win ({})".format(r))
                p.wins += 1
            else:
                logging.debug("Push ({})".format(r))
                p.pushes += 1

            # Data from the episode
            p.bankRoll += r
            p.cumReward.append(p.cumReward[-1]+r)
            p.wallet.append(p.bankRoll)

    print("Results over {} hands:".format(n))
    fig, ax = plt.subplots()
    for p in players:
        # The tallies live on the player object; with no recorded hands the
        # percentage would otherwise divide by zero.
        hands = p.wins+p.losses+p.pushes
        winPct = 100*(p.wins/hands) if hands else 0.0
        print("\n\t{} Wins: {} %\n\tOutcome: {}".format(p.name,winPct,p.bankRoll))
        ax.plot(p.cumReward,label=p.name)

    ax.legend(loc='best')
    plt.show()

#fig = plt.figure(1,figsize=(12,6))

#plt.subplot(121)
#plt.plot(CR)
#plt.title('Cumulative reward for {} hands'.format(n))

#plt.subplot(122)
#plt.plot(WALLET)
#plt.title('Bankroll over {} hands assuming $5 bet'.format(n))

#fig.tight_layout()
#logger.removeHandler(fhandler)

In [6]:
# %load main.py
from player import *
from shoe import Shoe
#from utilities import *
from collections import defaultdict

#import logging
#logger = logging.getLogger()
#fhandler = logging.FileHandler(filename='Experiment.log', mode='a')
#formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#fhandler.setFormatter(formatter)
#logger.addHandler(fhandler)
#logger.setLevel(logging.DEBUG)

# Global identifiers
#global Q, N, epsilon, lr, discount

# Game setup
# Game setup: one shoe and one dealer, shared by the training and test phases.
shoe, dealer = Shoe(1), Player("Dealer")

# Learning functions: start this run from empty tables.
Q, N = defaultdict(float), defaultdict(float)

# Learning variables
epsilon, lr, discount = 10, 0.08, 0.99

# Four players built with only a name generate the training episodes.
trainingPlayers = [
    Player(name)
    for name in ("Player 1", "Player 2", "Player 3", "Player 4")
]

trainQLAgent(dealer,trainingPlayers,shoe,10000)

# One player of each kind for the comparison; the 1000.00 argument is
# presumably the starting bankroll -- confirm against player.py.
testingPlayers = [
    QPolicyPlayer("Q Player",1000.00),
    OPolicyPlayer("Basic Player",1000.00),
    Player("Random",1000.00),
]

# NOTE(review): the recorded run of this cell ended with
# "NameError: name 'Q' is not defined" after training had finished.
# Presumably QPolicyPlayer looks Q up inside the player module rather than
# in this namespace -- confirm in player.py.
testQLAgent(dealer,testingPlayers,shoe,1000)

#logger.removeHandler(fhandler)


	Wins: 29.447499999999998 %


NameError: name 'Q' is not defined