<a href="https://colab.research.google.com/github/JohnYCLam/MachineLearning/blob/main/Reinforcement_Learning_Monte_Carlo%2C_TD_Learning_and_Function_Approximation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

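Reference: David Silver, *Lectures on Reinforcement Learning*, 2015. <https://www.davidsilver.uk/teaching/>

```bibtex
@Misc{silver2015,
  author = {David Silver},
  title = {Lectures on Reinforcement Learning},
  howpublished = {\textsc{url:}~\url{https://www.davidsilver.uk/teaching/}},
  year = {2015}
}
```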

In [35]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import product

In [36]:
#state: tuple (dealer_sum, player_sum)
#actions: a = 1 (hit), a = 0 (stick)

def step(state, a):
    """Advance the card game by one player action.

    Parameters
    ----------
    state : tuple
        ``(dealer_sum, player_sum)`` before the action is applied.
    a : int
        ``1`` means hit (the player draws one card). Any other value is
        treated as stick (the dealer plays out its hand).

    Returns
    -------
    tuple
        ``(next_state, reward, done)``. ``next_state`` is the updated
        ``(dealer_sum, player_sum)``, ``reward`` is ``-1``, ``0`` or ``1``
        from the player's point of view, and ``done`` is ``True`` once the
        episode has ended.

    Notes
    -----
    A hand is bust when its sum drops below 1 or exceeds 21. After a stick
    the dealer keeps drawing while its sum is below 17; if nobody is bust the
    higher sum wins and equal sums give reward 0.
    """
    dealer_sum, player_sum = state

    if a == 1:
        # Hit: only the player's sum changes.
        player_sum += _draw_card()
        if player_sum < 1 or player_sum > 21:
            return (dealer_sum, player_sum), -1, True
        return (dealer_sum, player_sum), 0, False

    # Stick: the dealer draws until it reaches 17 or more, or goes bust.
    while dealer_sum < 17:
        dealer_sum += _draw_card()
        if dealer_sum < 1 or dealer_sum > 21:
            return (dealer_sum, player_sum), 1, True

    # The comparison lives after the loop so that a dealer who is already at
    # 17 or more on entry still yields a result (previously the function fell
    # through and implicitly returned None in that case).
    if dealer_sum > 21:
        # Dealer was handed in already bust; score it like any other bust.
        return (dealer_sum, player_sum), 1, True

    if dealer_sum > player_sum:
        r = -1
    elif dealer_sum < player_sum:
        r = 1
    else:
        r = 0
    return (dealer_sum, player_sum), r, True


def _draw_card():
    """Return one signed card: value uniform in 1..10, added with p=2/3, subtracted with p=1/3."""
    # Sign is sampled before the value, matching the original draw order so
    # seeded runs consume the random stream identically.
    sign = np.random.choice([1, -1], p = [2/3, 1/3])
    value = np.random.choice(range(1, 11))
    return sign * value


In [20]:
# Play a single episode by hand: both sums start from a uniform draw in 1..10
# and the user picks an action at every prompt.
# NOTE: this cell blocks on input(), so it cannot run unattended.
state = (np.random.choice(range(1, 11)), np.random.choice(range(1, 11)))
print(f'Initial State: {state}')
done = False
while not done:
    raw_action = input("Input Next Action: ").strip()
    # Re-prompt instead of crashing on non-numeric text, and reject values
    # other than 0/1, which step() would otherwise silently treat as stick.
    if raw_action not in ("0", "1"):
        print("Please enter 1 (hit) or 0 (stick).")
        continue
    a = int(raw_action)
    state, r, done = step(state, a)
    print(f'State: {state} | reward: {r}')

Initial State: (10, 7)


KeyboardInterrupt: Interrupted by user

In [61]:
def eps_greedy_exploration(Q, state, n_s, n_0 = 100):
    """Choose an action for ``state`` with a visit-dependent epsilon-greedy rule.

    The exploration probability is ``n_0 / (n_0 + n_s)``, so it shrinks as the
    visit count ``n_s`` grows.

    Parameters
    ----------
    Q : mapping
        Maps each state to a list of action values indexed by action.
    state : hashable
        Key into ``Q``.
    n_s : int
        Number of visits recorded for ``state`` so far.
    n_0 : int, optional
        Constant controlling how quickly exploration decays.

    Returns
    -------
    int
        ``0`` or ``1`` drawn uniformly when exploring; otherwise the index of
        the largest entry of ``Q[state]`` (the first one on ties).
    """
    explore_prob = n_0 / (n_0 + n_s)

    # Explore: pick one of the two actions uniformly at random.
    if np.random.random() < explore_prob:
        return np.random.choice([0, 1])

    # Exploit: list.index returns the first maximiser, so ties go to action 0.
    action_values = Q[state]
    return action_values.index(max(action_values))

In [68]:
def generate_episode(Q, N_sa):
    """Roll out one complete episode under the epsilon-greedy policy.

    The start state has a dealer sum drawn uniformly from 1..10 and a player
    sum drawn uniformly from 1..21.

    Parameters
    ----------
    Q : mapping
        State -> list of action values, passed to ``eps_greedy_exploration``.
    N_sa : mapping
        State -> list of per-action visit counts; their sum is used as the
        state's visit count when choosing an action.

    Returns
    -------
    list
        ``(state, action, reward)`` tuples in the order they occurred.
    """
    # Dealer is sampled before the player, as in the original tuple literal.
    dealer_start = np.random.choice(range(1, 11))
    player_start = np.random.choice(range(1, 22))
    current = (dealer_start, player_start)

    trajectory = []
    while True:
        visits = sum(N_sa[current])
        action = eps_greedy_exploration(Q, current, visits)
        successor, reward, terminal = step(current, action)
        trajectory.append((current, action, reward))
        if terminal:
            return trajectory
        current = successor

In [81]:
# Tabular action values and visit counts for every (dealer_sum, player_sum)
# with dealer_sum in 1..10 and player_sum in 1..21; each entry is indexed by action.
Q, N_sa = {}, {}

for state in product(range(1, 11), range(1, 22)):
    # Fresh lists per state so that updating one entry never aliases another.
    Q[state], N_sa[state] = [0, 0], [0, 0]

In [82]:
# Every-visit Monte Carlo update: after each sampled episode, move Q towards
# the undiscounted return observed from each (state, action) onwards, using a
# 1/N step size so Q[state][a] is the running mean of those returns.
epochs = 100000
for _ in range(epochs):
    episode = generate_episode(Q, N_sa)
    G = 0
    # Iterate from the last step backwards so G accumulates the return-to-go.
    for state, a, r in episode[::-1]:
        G = G + r
        visit_count = N_sa[state][a] + 1
        N_sa[state][a] = visit_count
        Q[state][a] = Q[state][a] + 1 / visit_count * (G - Q[state][a])


In [83]:
#state: tuple (dealer_sum, player_sum)
#actions: a = 0 (stick), a = 1 (hit)
#Q[state] is a two-element list indexed by action: [value of stick, value of hit]
Q

{(1, 1): [0.38823529411764723, -0.15294117647058822],
 (1, 2): [0.28865979381443285, -0.2551020408163265],
 (1, 3): [0.29002320185614855, -0.11340206185567009],
 (1, 4): [0.20090293453724606, -0.0352941176470588],
 (1, 5): [0.295711060948081, -0.1153846153846154],
 (1, 6): [0.37254901960784315, 0.09565217391304347],
 (1, 7): [0.3709677419354837, 0.11111111111111109],
 (1, 8): [0.17303370786516856, 0.10638297872340427],
 (1, 9): [0.31498470948012225, 0.27876106194690253],
 (1, 10): [0.3461538461538461, -6.938893903907228e-18],
 (1, 11): [0.3280632411067192, 0.3653846153846153],
 (1, 12): [0.3404710920770882, 0.2673267326732674],
 (1, 13): [0.37759336099585067, 0.10666666666666663],
 (1, 14): [0.34482758620689646, 0.061224489795918394],
 (1, 15): [0.32776617954070997, 0.12631578947368424],
 (1, 16): [0.3459119496855345, -0.1975308641975308],
 (1, 17): [0.3865546218487393, -0.0952380952380953],
 (1, 18): [0.5974842767295592, -0.032967032967032975],
 (1, 19): [0.6632860040567957, -0.292929

In [84]:
# Greedy policy with respect to the learned Q: for each state keep the action
# whose value is largest.
optimal_policy = {}
for state, values in Q.items():
    # list.index returns the first maximiser, so ties resolve to action 0.
    a = values.index(max(values))
    optimal_policy[state] = a

In [85]:
# Greedy action per (dealer_sum, player_sum) state: 0 = stick, 1 = hit.
optimal_policy

{(1, 1): 0,
 (1, 2): 0,
 (1, 3): 0,
 (1, 4): 0,
 (1, 5): 0,
 (1, 6): 0,
 (1, 7): 0,
 (1, 8): 0,
 (1, 9): 0,
 (1, 10): 0,
 (1, 11): 1,
 (1, 12): 0,
 (1, 13): 0,
 (1, 14): 0,
 (1, 15): 0,
 (1, 16): 0,
 (1, 17): 0,
 (1, 18): 0,
 (1, 19): 0,
 (1, 20): 0,
 (1, 21): 0,
 (2, 1): 0,
 (2, 2): 0,
 (2, 3): 0,
 (2, 4): 0,
 (2, 5): 0,
 (2, 6): 0,
 (2, 7): 0,
 (2, 8): 0,
 (2, 9): 0,
 (2, 10): 0,
 (2, 11): 0,
 (2, 12): 0,
 (2, 13): 0,
 (2, 14): 0,
 (2, 15): 0,
 (2, 16): 0,
 (2, 17): 0,
 (2, 18): 0,
 (2, 19): 0,
 (2, 20): 0,
 (2, 21): 0,
 (3, 1): 0,
 (3, 2): 0,
 (3, 3): 0,
 (3, 4): 0,
 (3, 5): 0,
 (3, 6): 0,
 (3, 7): 0,
 (3, 8): 0,
 (3, 9): 0,
 (3, 10): 1,
 (3, 11): 1,
 (3, 12): 1,
 (3, 13): 0,
 (3, 14): 0,
 (3, 15): 0,
 (3, 16): 0,
 (3, 17): 0,
 (3, 18): 0,
 (3, 19): 0,
 (3, 20): 0,
 (3, 21): 0,
 (4, 1): 0,
 (4, 2): 0,
 (4, 3): 0,
 (4, 4): 0,
 (4, 5): 0,
 (4, 6): 0,
 (4, 7): 0,
 (4, 8): 0,
 (4, 9): 0,
 (4, 10): 1,
 (4, 11): 1,
 (4, 12): 0,
 (4, 13): 0,
 (4, 14): 0,
 (4, 15): 0,
 (4, 16): 0,
 (4, 17):