## Racetrack Problem
From Exercise 5.12.
This implementation is adapted from [this Towards Data Science article](https://towardsdatascience.com/solving-reinforcement-learning-racetrack-exercise-building-the-environment-33712602de0c/). Its structure is somewhat similar to that of `gym_practice.py`.

In [5]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
from gymnasium import Env
from scipy.ndimage import uniform_filter

from race_track import RaceTrack

In [6]:
###### Off-policy Monte Carlo Control part ######

def behavior_pi(state: tuple, nA: int, target_pi: np.ndarray, epsilon: float) -> tuple:
    """Sample one action from the epsilon-greedy behavior policy.

    With probability ``1 - epsilon`` the greedy action ``target_pi[state]`` is
    taken; otherwise an action is drawn uniformly from ``range(nA)`` (which
    may happen to be the greedy one again).

    Args:
        state: Index into ``target_pi`` identifying the current state.
        nA: Number of available actions.
        target_pi: Table mapping each state to its greedy action index.
        epsilon: Exploration rate.

    Returns:
        ``(action, prob)`` where ``prob`` is the probability the behavior
        policy assigns to ``action`` in ``state``:
        ``1 - epsilon + epsilon / nA`` for the greedy action and
        ``epsilon / nA`` for every other action.
    """
    greedy_act = target_pi[state]

    # Draw the exploration coin first; the uniform action is only sampled
    # when we actually explore.
    if np.random.rand() > epsilon:
        action = greedy_act
    else:
        action = np.random.choice(nA)

    # The greedy action can be reached through both branches above, so its
    # probability is the sum of the two paths regardless of which one fired.
    if action == greedy_act:
        prob = 1 - epsilon + epsilon / nA
    else:
        prob = epsilon / nA
    return action, prob


def off_policy_monte_carlo(total_episodes:int, track_map:str, render_mode:str, zero_acc:bool=False,
                           gamma:float=0.9, epsilon:float=0.1) -> tuple:
    """Run off-policy Monte Carlo control with weighted importance sampling.

    Episodes are generated with the epsilon-greedy policy ``behavior_pi``
    built around the current greedy target policy. After each episode the
    trajectory is replayed backwards to update the action values ``Q`` and
    the greedy target policy.

    Args:
        total_episodes: Number of episodes to sample.
        track_map: Track identifier forwarded to ``RaceTrack``.
        render_mode: Render mode forwarded to ``RaceTrack``.
        zero_acc: When truthy, each step has a 0.1 chance of executing
            action 4 instead of the action chosen by the behavior policy.
        gamma: Discount factor (defaults to the previously hard-coded 0.9).
        epsilon: Exploration rate of the behavior policy (defaults to the
            previously hard-coded 0.1).

    Returns:
        ``(reward_hist, Q)``: ``reward_hist`` is a float32 array with the
        undiscounted return of every episode, ``Q`` is the learned
        action-value table of shape ``(*env.nS, env.nA)``.
    """
    zero_acc_prob = 0.1    # chance of overriding the chosen action when zero_acc is on
    zero_acc_action = 4    # presumably the "no acceleration" action in RaceTrack's encoding -- confirm
    log_every = 10000      # progress is printed every this many episodes

    env = RaceTrack(track_map, render_mode, size=20)
    n_actions = env.nA # (9, ) nine actions
    state_shape = env.nS # (row, col, cur_speed_x, cur_speed_y)

    # Initialize action values (Q) and cumulative importance weights (C).
    # Q is shifted down by 500: with a mean-0, variance-1 initialization the
    # values would be too optimistic, because the returns seen at the start
    # of training are strongly negative.
    Q = np.random.normal(size=(*state_shape, n_actions))
    Q -= 500
    C = np.zeros_like(Q)
    target_pi = np.argmax(Q, axis=-1)  # the index of the largest action
    reward_hist = np.zeros(shape=(total_episodes), dtype=np.float32)

    for i in range(total_episodes):
        trajectory = []
        terminated = False
        state, info = env.reset()
        total_reward = 0

        # Generate one episode with the behavior policy. The action is sampled
        # at the top of every step so that no action is drawn for the state
        # reached after termination.
        while not terminated:
            action, act_prob = behavior_pi(state, n_actions, target_pi, epsilon)

            if zero_acc and np.random.rand() <= zero_acc_prob:
                executed = zero_acc_action
            else:
                executed = action
            observation, reward, terminated, _ = env.step(executed)

            total_reward += reward
            # The chosen action (not the possibly overridden one) is recorded,
            # so the override is treated as part of the environment dynamics.
            trajectory.append((state, action, reward, act_prob))
            state = observation

        # Replay the episode backwards, accumulating the discounted return G
        # and the importance-sampling weight W.
        G = 0
        W = 1
        while trajectory:
            (state, action, reward, act_prob) = trajectory.pop()
            G = gamma * G + reward
            C[state][action] = C[state][action] + W
            Q[state][action] = Q[state][action] + (W/C[state][action]) * (G - Q[state][action])

            target_pi[state] = np.argmax(Q[state])
            # The target policy is greedy: once the taken action differs from
            # it, all earlier steps carry zero weight and can be skipped.
            if action != target_pi[state]:
                break
            W = W * (1 / act_prob)

        reward_hist[i] = total_reward

        if i % log_every == 0:
            print(f'Episode: {i}, reward: {total_reward}, epsilon: {epsilon}')

    return reward_hist, Q

In [7]:
def plot_result(value_hist:dict, total_episodes) -> None:
    """Plot smoothed per-episode rewards on a log-scaled episode axis.

    Each entry of ``value_hist`` becomes one curve, smoothed with a
    20-sample uniform filter. The figure is saved under ``./tracks/`` and
    then shown.

    Args:
        value_hist: Maps ``"<title>,<label>"`` keys to reward histories of
            length ``total_episodes``. The part before the first comma is
            used for the figure title and file name (taken from the last
            entry), the remainder as the legend label.
        total_episodes: Number of episodes in every history.

    Raises:
        ValueError: If ``value_hist`` is empty.
    """
    if not value_hist:
        # Without at least one series there is no title to build the file name from.
        raise ValueError('value_hist must contain at least one reward history')

    line_width = 1.2
    fontdict = {'fontsize': 12, 'fontweight': 'bold'}

    plt.figure(figsize=(10, 6), dpi=150)
    plt.ylim((-500.0, 0.0))
    plt.grid(c='lightgray')
    plt.margins(0.02)

    # Emphasize the left/bottom axis lines and hide the other two. Spines are
    # addressed by name rather than by their position in the container.
    spines = plt.gca().spines
    for side in ('left', 'bottom'):
        spines[side].set_linewidth(1.5)
    for side in ('right', 'top'):
        spines[side].set_visible(False)

    # Episodes are numbered from 1: a log axis cannot represent x = 0.
    x = np.arange(1, total_episodes + 1)
    plt.xscale('log')
    plt.xticks([1, 10, 100, 1000, 10_000, 100_000, 1_000_000],
               ['1', '10', '100', '1000', '10,000', '100,000', '1,000,000'])

    colors = ['tomato', 'cornflowerblue']
    for i, (key, value) in enumerate(value_hist.items()):
        # partition() tolerates keys with no comma or with several commas.
        title, _, label = key.partition(',')
        plt.plot(x, uniform_filter(value, size=20),
                 linewidth=line_width,
                 label=label,
                 c=colors[i % len(colors)],  # cycle when there are more series than colors
                 alpha=0.95)

    plt.title(title + ' training record', fontdict=fontdict)
    plt.xlabel('Episodes (log scale)', fontdict=fontdict)
    plt.ylabel('Rewards', fontdict=fontdict)
    plt.legend()

    # Make sure the output directory exists before saving.
    os.makedirs('./tracks', exist_ok=True)
    plt.savefig(f'./tracks/{"_".join(title.lower().split())}.png')
    plt.show()


In [None]:
if __name__ == "__main__":
    # Entry point: either train on one track (with and without the random
    # zero-acceleration override) or load saved Q values and draw sample paths.

    train = True # Switch between train and evaluation
    track_sel = 'a'
    total_episodes = 100_000
    history_path = f'./history/exercise_5_12/track_{track_sel}.pkl'

    if train:
        reward_hist_dict = dict()
        Q_dict = dict()
        track_name = f'Track {track_sel.capitalize()}'

        # First run without, then with, the zero-acceleration override.
        for use_zero_acc in (False, True):
            label = 'with zero acc.' if use_zero_acc else 'without zero acc.'
            key = track_name + ',' + label

            reward_hist, Q = off_policy_monte_carlo(total_episodes, track_sel, None, use_zero_acc)
            reward_hist_dict[key] = reward_hist
            Q_dict[key] = Q

        # Persist the learned Q values before plotting so that a plotting
        # failure cannot discard the training run.
        os.makedirs(os.path.dirname(history_path), exist_ok=True)
        with open(history_path, 'wb') as f:
            pickle.dump(Q_dict, f)

        plot_result(reward_hist_dict, total_episodes)

    else: # Evaluate the Q values and plot sample paths

        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were written by the training branch above.
        with open(history_path, 'rb') as f:
            Q_dict = pickle.load(f)

        # Use the first stored entry (insertion order of the training loop).
        key = next(iter(Q_dict))
        Q = Q_dict[key]
        policy = np.argmax(Q, axis=-1) # greedy policy

        env = RaceTrack(track_sel, None, 20)
        fig = plt.figure(figsize=(12, 5), dpi=150)
        fig.suptitle('Sample trajectories', size=12, weight='bold')

        for i in range(10):
            track_map = np.copy(env.track_map)
            state, info = env.reset()
            terminated = False

            while not terminated:
                # Mark the visited cell on a private copy of the map.
                track_map[state[0], state[1]] = 0.6
                action = policy[state]
                # env.step returns four values everywhere else in this file.
                state, reward, terminated, _ = env.step(action)

            ax = plt.subplot(2, 5, i + 1)
            ax.axis('off')
            ax.imshow(track_map, cmap='GnBu')

        plt.tight_layout()
        os.makedirs('./tracks', exist_ok=True)
        plt.savefig(f'./tracks/track_{track_sel}_paths.png')
        plt.show()

In [None]:
## TEST CODE FOR THE RANDOM POLICY PROGRAM
# Smoke test: drives track 'b' in 'human' render mode with random actions and
# prints every observation. The loop has no exit condition, so it runs until
# it is interrupted.

agent = RaceTrack('b', 'human')
agent.reset()

# The very first move is always action 8; all later moves are random.
observation, reward, terminated, truncated = agent.step(8)
print("Initial Observation: ", *observation[:2], " with speed: ", *observation[2:4])

while True:
    random_action = np.random.randint(0, 9)
    observation, reward, terminated, truncated = agent.step(random_action)
    print("At: ", *observation[:2], " with speed: ", *observation[2:4])

    # Start a fresh episode once the current one has ended.
    if terminated:
        agent.reset()
