In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [56]:
class Racetrack:
    """Grid racetrack environment plus an on-policy Monte Carlo control solver.

    The track is an integer grid: 0 = off-track, 1 = track, 2 = start cell,
    3 = goal cell.  A state is a grid index ``(a, b)`` used directly as
    ``self.track[a, b]``.  On every step ``vel_horiz`` is added to the first
    index and ``vel_verti`` to the second.  The velocity is tracked by the
    environment but is not part of the state seen by the learner.
    """

    #exclusive upper bound for each velocity component (allowed values 0..4)
    MAX_VELOCITY = 5

    def __init__(self,start_states,goal_states,input_track,shape=(50,50)):
        """Build the grid and place the car on a random start cell.

        Parameters
        ----------
        start_states : list of index tuples marked as start cells (value 2)
        goal_states : list of index tuples marked as goal cells (value 3)
        input_track : list of index tuples marked as drivable track (value 1)
        shape : grid shape, defaults to the original 50x50 layout
        """
        self.track = np.zeros(shape,dtype = 'int')
        self.start_states = start_states
        self.goal_states = goal_states
        #name -> action index lookup; not used by the class itself
        #NOTE(review): "h_u-v_s" maps to index 1 = (0,1), which step() applies
        #as vel_horiz += 0, vel_verti += 1 -- confirm the intended h_/v_ convention
        self.action_dict = {"h_s-v_s": 0, "h_u-v_s": 1, "h_s-v_u": 2,
                            "h_u-v_u": 3, "h_d-v_s": 4, "h_s-v_d": 5,
                            "h_d-v_d": 6, "h_u-v_d": 7, "h_d-v_u": 8}
        #velocity increments (d_horiz, d_verti) for each action index
        self.action_coords = [(0,0),(0,1),(1,0),(1,1),(0,-1),(-1,0),(-1,-1),(-1,1),(1,-1)]
        self.track_setup(start_states, goal_states,input_track)
        self.reset()

    def track_setup(self,start_states, goal_states,input_track):
        """Mark track cells with 1, then start cells with 2, then goal cells with 3.

        The order matters: start and goal cells overwrite plain track cells.
        """
        for cell in input_track:
            self.track[cell] = 1
        for cell in start_states:
            self.track[cell] = 2
        for cell in goal_states:
            self.track[cell] = 3

    def reset(self):
        """Put the car on a uniformly random start cell with zero velocity.

        Raises
        ------
        ValueError : if there are no start states to choose from
        """
        n_starts = len(self.start_states)
        if n_starts == 0:
            raise ValueError("Racetrack needs at least one start state")
        self.state = self.start_states[np.random.randint(0,n_starts)]
        self.vel_horiz = 0
        self.vel_verti = 0

    def step(self,action):
        """Apply the velocity increment of ``action``, move the car, reset on a crash."""
        d_horiz, d_verti = self.action_coords[action]
        self.vel_horiz += d_horiz
        self.vel_verti += d_verti
        self.state = (self.state[0] + self.vel_horiz, self.state[1] + self.vel_verti)
        if self.crash():
            self.reset()

    def plot_track(self):
        """Show the track grid as an image."""
        fig, ax = plt.subplots()
        ax.imshow(self.track)
        plt.show()

    def plot_trajectory(self,S):
        """Show one figure per visited state in ``S`` with that cell highlighted (value 5)."""
        for cell in S:
            #draw on a copy so self.track is never left modified, even on error
            frame = self.track.copy()
            frame[cell] = 5
            fig, ax = plt.subplots()
            ax.imshow(frame)
            plt.show()
            plt.pause(0.4)
            plt.close(fig)

    def crash(self):
        """Return True if the car is outside the grid or on an off-track (0) cell."""
        n_first, n_second = self.track.shape
        first, second = self.state
        #bounds must be checked first: negative indices would wrap around in numpy
        if not (0 <= first < n_first and 0 <= second < n_second):
            return True
        return bool(self.track[first, second] == 0)

    def allowed_actions(self):
        """Return the action indices that keep both velocity components in [0, MAX_VELOCITY)."""
        actions_allowed = [
            idx for idx, (d_horiz, d_verti) in enumerate(self.action_coords)
            if 0 <= self.vel_horiz + d_horiz < self.MAX_VELOCITY
            and 0 <= self.vel_verti + d_verti < self.MAX_VELOCITY
        ]
        return np.array(actions_allowed, dtype=int)

    def next_action(self,pi,eps):
        """Pick the next action epsilon-greedily among the currently allowed actions.

        With probability ``1 - eps`` (when ``np.random.rand() > eps``) the allowed
        action with the highest ``pi[self.state]`` entry is returned, ties broken
        uniformly at random; otherwise a uniformly random allowed action is
        returned.  ``pi`` is only read, never modified.

        Raises
        ------
        ValueError : if the current velocity leaves no allowed action
        """
        allowed = self.allowed_actions()
        if allowed.size == 0:
            raise ValueError("no allowed action for velocity "
                             f"({self.vel_horiz}, {self.vel_verti})")
        if np.random.rand() > eps:
            #greedy choice restricted to allowed actions, without touching pi
            probs = np.asarray(pi[self.state])[allowed]
            best = allowed[probs == probs.max()]
            return int(best[np.random.randint(0,len(best))])
        return int(allowed[np.random.randint(0,len(allowed))])

    def MC_control(self,eps,gamma,nr_episodes = 5000):
        """On-policy every-visit Monte Carlo control with an eps-soft policy.

        Every step has reward -1.  An episode ends when the car lands exactly on
        a goal cell; a crash inside an episode puts the car back on the start
        line and the episode continues.  After training, the per-episode total
        reward is plotted.

        Parameters
        ----------
        eps : exploration probability of the eps-soft policy
        gamma : discount factor
        nr_episodes : number of episodes to run

        Returns
        -------
        Q : action-value estimates, shape ``track.shape + (n_actions,)``
        pi : eps-soft policy probabilities, same shape as ``Q``
        """
        n_actions = len(self.action_coords)
        table_shape = tuple(self.track.shape) + (n_actions,)
        pi = np.ones(table_shape)/n_actions
        C = np.zeros(table_shape, dtype = 'int')
        Q = np.zeros(table_shape)
        r_list = []

        #make sure the first episode starts on the start line with zero velocity
        self.reset()
        for k in range(nr_episodes):
            r = 0
            S = []
            A = []
            s = self.state
            finished = False
            while not finished:
                r -= 1
                S.append(s)
                a = self.next_action(pi, eps)
                A.append(a)
                self.step(a)
                s = self.state
                if s in self.goal_states:
                    self.reset()
                    finished = True
            r_list.append(r)

            #walk the episode backwards, accumulating the discounted return
            G = 0
            for s_t, a_t in zip(reversed(S), reversed(A)):
                G = gamma*G - 1
                C[s_t][a_t] += 1
                #incremental mean of the observed returns
                Q[s_t][a_t] = Q[s_t][a_t] + 1/C[s_t][a_t]*(G - Q[s_t][a_t])
                A_star = get_argmax(Q[s_t])
                pi[s_t] = eps/n_actions
                pi[s_t][A_star] = 1 - eps + eps/n_actions
        plt.plot(np.arange(nr_episodes),r_list)
        plt.show()
        return Q, pi
    
    
def get_argmax(Q):
    """Return the index of the largest entry of ``Q``, breaking ties at random.

    Parameters
    ----------
    Q : 1-D sequence or array of numbers (any non-zero length)

    Returns
    -------
    int : index of a maximal entry; when several entries share the maximum,
        one of them is chosen uniformly with ``np.random.randint``.

    Raises
    ------
    ValueError : if ``Q`` is empty or its maximum is NaN
    """
    values = np.asarray(Q)
    if values.size == 0:
        raise ValueError("get_argmax needs at least one value")
    #all positions that attain the maximum (more than one on ties)
    index = np.flatnonzero(values == values.max())
    if index.size == 0:
        #only happens when the maximum is NaN, which equals nothing
        raise ValueError("get_argmax cannot rank NaN values")
    return int(index[np.random.randint(0,len(index))])

In [61]:
#drivable cells: first index 20..39, second index 0..49
input_track = [(j, i) for i in range(50) for j in range(20, 40)]

#start line: the drivable cells whose second index is 0
start_states = [(j, 0) for j in range(20, 40)]

#finish line: the drivable cells whose second index is 49
goal_states = [(j, 49) for j in range(20, 40)]

In [62]:
#build the environment from the cell lists defined above
Race = Racetrack(
    start_states=start_states,
    goal_states=goal_states,
    input_track=input_track,
)

In [63]:
#select matplotlib's Qt backend for this kernel, then draw the track grid
%matplotlib qt
Race.plot_track()

In [64]:
%matplotlib qt
Q, pi = Race.MC_control(0.05,0.7)

In [65]:
#learned action values (one per action index) at the start cell (22, 0)
print(Q[22, 0])

[-3.33117282 -3.3286704  -3.33219662 -3.32869161  0.         -3.33058941
  0.         -3.32861737  0.        ]
