In [None]:
import numpy as np
import random

In [None]:
# Default hyperparameters for semi-gradient SARSA.
LEARNING_RATE = .003
DISCOUNT = .85

# ASCII side view of the track. The literal is a raw string so that every
# backslash is kept verbatim: sequences such as "\ " and "\_" are not valid
# escapes and trigger a deprecation/syntax warning in a normal string.
terrain = r'''
\                       |
 \           ___        |
  \         /   \       |
   \       /     \_/wwww|
    \_____/             |
'''
chars = ['\\', '-', '_', '/', 'w']

# Drop the empty first and last entries produced by the newlines that
# directly follow / precede the triple quotes.
grid = terrain.split('\n')[1:-1]
max_length = max([len(r) for r in grid])

# Collapse the 2-D drawing into a 1-D track: for each column keep the first
# non-space character found when scanning the rows from top to bottom.
states = []
for col in range(max_length):
    for row in grid:
        # Rows may be shorter than the widest one (e.g. stripped trailing
        # whitespace); treat the missing cells as blanks instead of raising.
        if col < len(row) and row[col] != ' ':
            states.append(row[col])
            break

In [None]:
def phi(states, state, momentum, action, normalization_factor=10):
    """Build the feature vector for one (position, momentum, action) triple.

    The returned 1-D numpy array is laid out as follows (32 entries):
      * 1  - momentum divided by ``normalization_factor``
      * 3  - one-hot action: [brake, anything else, acc]
      * 1  - scaled distance from ``state`` to the first 'w' cell
      * 3  - scaled counts of backslash cells, forward-slash cells and all
             other cells between ``state`` and the first 'w' (inclusive)
      * 3  - one-hot comparing ``momentum`` with the net slope count
             (backslashes minus forward slashes): [less, equal, greater]
      * 1  - scaled difference between ``momentum`` and the net slope count
      * 20 - position tiles: the tile nearest to the relative position holds
             the scaled net slope count, its direct neighbours half of it

    Raises IndexError when ``states`` contains no 'w' cell.
    """
    scale = normalization_factor
    features = [momentum / scale]

    # One-hot action; every action other than 'acc'/'brake' uses the middle slot.
    action_slot = 2 if action == 'acc' else (0 if action == 'brake' else 1)
    action_one_hot = [0, 0, 0]
    action_one_hot[action_slot] = 1
    features += action_one_hot

    # Distance to the first goal cell.
    goal = [pos for pos, cell in enumerate(states) if cell == 'w'][0]
    features.append((goal - state) / scale)

    # Slope census over the stretch of track that is still ahead.
    ahead = [states[pos] for pos in range(state, goal + 1)]
    backslashes = ahead.count('\\')
    slashes = ahead.count('/')
    others = len(ahead) - backslashes - slashes
    features += [backslashes / scale, slashes / scale, others / scale]

    # How the current momentum compares with the net slope count.
    net_slope = backslashes - slashes
    if momentum > net_slope:
        compare_slot = 2
    elif momentum < net_slope:
        compare_slot = 0
    else:
        compare_slot = 1
    compare_one_hot = [0, 0, 0]
    compare_one_hot[compare_slot] = 1
    features += compare_one_hot
    features.append((momentum - net_slope) / scale)

    # Coarse position tiles with half-weight spill-over into the neighbours.
    tiles = [0] * 20
    last_tile = len(tiles) - 1
    centre = round((state / len(states)) * last_tile)
    tile_value = net_slope / scale
    tiles[centre] = tile_value
    if 0 < centre:
        tiles[centre - 1] = tile_value / 2
    if centre < last_tile:
        tiles[centre + 1] = tile_value / 2
    features += tiles

    return np.array(features)


# Momentum change contributed by the action the agent picks.
action_momentum_dict = dict(acc=1, brake=-1, neutral=0)

# Momentum change contributed by the terrain character under the agent:
# a backslash adds momentum, a forward slash removes it, and both the flat
# cell '_' and the goal cell 'w' leave it unchanged.
state_momentum_dict = {'\\': 1, '/': -1}
state_momentum_dict.update(dict.fromkeys(('_', 'w'), 0))


def epsilon_greedy_policy(q_weights, states, state, momentum, epsilon=0.3):
    """Pick an action with an epsilon-greedy rule over the linear Q estimate.

    With probability ``epsilon`` a uniformly random action is returned.
    Otherwise the action whose estimate ``np.dot(q_weights, phi(...))`` is
    largest is returned, with ties broken uniformly at random.

    Args:
        q_weights: weight vector of the linear action-value function.
        states: the 1-D track passed through to ``phi``.
        state: current position index on the track.
        momentum: current momentum of the agent.
        epsilon: exploration probability.

    Returns:
        One of 'acc', 'brake' or 'neutral'.
    """
    actions = ['acc', 'brake', 'neutral']

    if random.random() < epsilon:
        return random.choice(actions)

    best_q = float('-inf')
    best_actions = []
    for a in actions:
        q_val = np.dot(q_weights, phi(states, state, momentum, a))
        if best_q < q_val:
            best_q = q_val
            best_actions = [a]
        elif best_q == q_val:
            best_actions.append(a)

    if not best_actions:
        # Every estimate compared false against -inf, which only happens when
        # the Q-values are NaN (diverged weights). Fall back to a random
        # action instead of silently returning None, which would surface
        # later as a confusing KeyError in the caller.
        return random.choice(actions)
    return random.choice(best_actions)


def semi_gradient_SARSA(states, lr=LEARNING_RATE, d=DISCOUNT, episode_count=2000, e = .3):
    """Train a linear action-value function with episodic semi-gradient SARSA.

    Every episode starts at position 0 with momentum 0. On each step the
    momentum changes by the action's contribution plus the contribution of
    the terrain cell under the agent, and the agent advances one cell when
    the resulting momentum is positive. An episode ends with

      * success (reward 5) when the agent is on a 'w' cell with momentum 0;
      * failure (reward -5) when it reaches the '|' cell or its momentum
        becomes negative.

    Every other step is rewarded with ``-(m + 1)`` where ``m`` is the
    momentum before the step. Progress is printed once per episode.

    Args:
        states: 1-D track, one terrain character per position.
        lr: step size of the weight update.
        d: discount factor.
        episode_count: number of episodes to run.
        e: initial exploration rate; multiplied by 0.995 at the start of
            every episode and never allowed below 0.01.

    Returns:
        The learned weight vector as a numpy array.
    """
    # Only the feature-vector length is needed here; the action value 0 is
    # neither 'acc' nor 'brake', so phi encodes it like the neutral action.
    weights_len = len(phi(states, 0, 0, 0))
    q_weights = np.random.uniform(-1, 1, size=weights_len)

    decay_factor = 0.995  # Decay rate for epsilon, can be tuned
    min_epsilon = 0.01

    while episode_count > 0:
        episode_count -= 1

        print(f'Episodes Remaining: {episode_count}', end='\t')

        e = max(min_epsilon, e * decay_factor)

        s = 0
        m = 0
        # Pass the decayed epsilon explicitly; otherwise the policy would
        # keep exploring at its own default rate for the whole run.
        a = epsilon_greedy_policy(q_weights, states, s, m, e)

        while True:

            # Take action A, observe R, S'
            action_m = action_momentum_dict[a]
            if states[s] == '/':
                action_m = min(0, action_m) # Car is too weak to get momentum uphill
            m_prime = m + action_m + state_momentum_dict[states[s]]
            if m_prime > 0:
                s_prime = s + 1
            else:
                s_prime = s

            fin_ep = False
            # Terminal state: Enter goal states 'w' fully stopping with momentum (m) as 0.
            if states[s_prime] == 'w' and m_prime == 0:
                print('Success')
                r = 5
                fin_ep = True
            elif states[s_prime] == '|' or m_prime < 0: # Agent overshot the terrain we provided or ran out of momentum prematurely
                print('Failure')
                r = -5
                fin_ep = True
            else:
                r = -1 * (m + 1) # Reward slower and softer movement

            # Semi-gradient SARSA update. For a linear approximator the
            # gradient of q(s, a, w) with respect to w is the feature vector,
            # so the TD error must be scaled by phi(s, a) in both branches.
            features = phi(states, s, m, a)
            q_current = np.dot(q_weights, features)

            if fin_ep:
                # Terminal transition: the target is the reward alone.
                q_weights += lr * (r - q_current) * features
                break

            a_prime = epsilon_greedy_policy(q_weights, states, s_prime, m_prime, e)
            q_next = np.dot(q_weights, phi(states, s_prime, m_prime, a_prime))
            q_weights += lr * (r + d * q_next - q_current) * features

            s = s_prime
            m = m_prime
            a = a_prime

    return q_weights

# Run training on the parsed terrain with the default hyperparameters;
# one progress line is printed per episode.
semi_gradient_SARSA(states)

Episodes Remaining: 1999	Failure
Episodes Remaining: 1998	Failure
Episodes Remaining: 1997	Failure
Episodes Remaining: 1996	Failure
Episodes Remaining: 1995	Failure
Episodes Remaining: 1994	Failure
Episodes Remaining: 1993	Failure
Episodes Remaining: 1992	Failure
Episodes Remaining: 1991	Failure
Episodes Remaining: 1990	Failure
Episodes Remaining: 1989	Failure
Episodes Remaining: 1988	Failure
Episodes Remaining: 1987	Failure
Episodes Remaining: 1986	Failure
Episodes Remaining: 1985	Failure
Episodes Remaining: 1984	Failure
Episodes Remaining: 1983	Failure
Episodes Remaining: 1982	Failure
Episodes Remaining: 1981	Failure
Episodes Remaining: 1980	Failure
Episodes Remaining: 1979	Failure
Episodes Remaining: 1978	Failure
Episodes Remaining: 1977	Failure
Episodes Remaining: 1976	Failure
Episodes Remaining: 1975	Failure
Episodes Remaining: 1974	Failure
Episodes Remaining: 1973	Failure
Episodes Remaining: 1972	Failure
Episodes Remaining: 1971	Failure
Episodes Remaining: 1970	Failure
Episodes R