In [7]:
import numpy as np
import gym

In [8]:
def eps_greedy(Q, s, eps=0.1):
    """Choose an action for state ``s`` using an epsilon-greedy rule.

    A uniform sample is drawn first; when it is below ``eps`` a random column
    index of ``Q`` is returned, otherwise the decision is delegated to
    ``greedy(Q, s)``.

    Args:
        Q: action-value table; ``Q.shape[1]`` is used as the number of actions.
        s: state (row index into ``Q``).
        eps: exploration threshold compared against the uniform sample.

    Returns:
        The selected action index.
    """
    explore = np.random.uniform(0, 1) < eps
    if not explore:
        # Exploitation branch: defer to the greedy policy.
        return greedy(Q, s)

    # Exploration branch: any action, uniformly at random.
    n_actions = Q.shape[1]
    return np.random.randint(n_actions)

In [9]:
def greedy(Q, s):
    """Return the index of the largest entry in row ``s`` of ``Q``.

    The lookup is ``np.argmax(Q[s])``, so ties resolve to the lowest index.
    """
    action_values = Q[s]
    best_action = np.argmax(action_values)
    return best_action

In [10]:
def run_episodes(env, Q, num_episodes=100, to_print=False):
    """Play ``num_episodes`` games acting greedily on ``Q`` and return the mean score.

    Args:
        env: environment used as ``state = env.reset()`` and
            ``next_state, rew, done, info = env.step(action)`` (4-tuple step).
        Q: action-value table passed to ``greedy(Q, state)`` to pick each action.
        num_episodes: number of complete episodes to play.
        to_print: when true, print the mean score and the number of games.

    Returns:
        ``np.mean`` of the per-episode reward sums.
    """
    tot_rew = []  # one summed reward per finished episode
    state = env.reset()

    for _ in range(num_episodes):
        done = False
        game_rew = 0

        while not done:
            next_state, rew, done, _info = env.step(greedy(Q, state))
            state = next_state
            game_rew += rew

        # The loop only exits once the episode is done: reset the environment
        # for the next game and record this game's total.
        state = env.reset()
        tot_rew.append(game_rew)

    mean_rew = np.mean(tot_rew)

    if to_print:
        # '%d' formats the episode count. The earlier '%1 games' was not a
        # valid conversion specifier and raised ValueError when printing.
        print('Mean score: %.3f of %d games!' % (mean_rew, num_episodes))

    return mean_rew

In [11]:
def SARSA(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005,
          min_eps=0.01, test_every=300, test_episodes=1000):
    """Learn a tabular action-value function with on-policy SARSA.

    Actions are chosen with ``eps_greedy`` and the table is updated from the
    transition ``(state, action, reward, next_state, next_action)``. Every
    ``test_every`` episodes the current table is evaluated with
    ``run_episodes`` and a progress line is printed.

    Args:
        env: environment exposing ``action_space.n``, ``observation_space.n``,
            ``reset()`` and a 4-tuple ``step(action)``.
        lr: learning rate applied to the TD error.
        num_episodes: number of training episodes.
        eps: initial exploration rate passed to ``eps_greedy``.
        gamma: discount factor.
        eps_decay: amount subtracted from ``eps`` at the start of each episode.
        min_eps: floor for ``eps``; decay never takes it below this value.
        test_every: evaluate every this many episodes; a falsy value (0/None)
            disables evaluation.
        test_episodes: number of episodes played in each evaluation.

    Returns:
        The learned ``(nS, nA)`` NumPy array ``Q``.
    """
    nA = env.action_space.n
    nS = env.observation_space.n

    Q = np.zeros((nS, nA))

    for ep in range(num_episodes):
        state = env.reset()
        done = False

        # Linear decay, clamped at the floor so a large eps_decay cannot push
        # eps below min_eps (or negative, which would disable exploration).
        if eps > min_eps:
            eps = max(min_eps, eps - eps_decay)

        action = eps_greedy(Q, state, eps)

        while not done:
            next_state, rew, done, _ = env.step(action)

            # On-policy: the next action is picked now and really executed on
            # the next iteration.
            next_action = eps_greedy(Q, next_state, eps)

            # SARSA temporal-difference update.
            td_target = rew + gamma * Q[next_state][next_action]
            Q[state][action] += lr * (td_target - Q[state][action])

            state = next_state
            action = next_action

        # Periodic evaluation of the current table.
        if test_every and (ep % test_every) == 0:
            test_rew = run_episodes(env, Q, test_episodes)
            print("Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, test_rew))
    return Q

SARSA_Taxi_V3

In [12]:
if __name__ == '__main__':
    # Train a tabular SARSA agent on Gym's Taxi-v3 task; progress lines are
    # printed by SARSA itself.
    env = gym.make('Taxi-v3')
    print("SARSA")
    Q_sarsa = SARSA(
        env,
        lr=0.1,
        num_episodes=5000,
        eps=0.4,
        gamma=0.95,
        eps_decay=0.001,
    )

SARSA
Episode:    0  Eps:0.3990  Rew:-268.2560
Episode:  300  Eps:0.0990  Rew:-220.8590
Episode:  600  Eps:0.0100  Rew:-174.3740
Episode:  900  Eps:0.0100  Rew:-213.8030
Episode: 1200  Eps:0.0100  Rew:-99.1470
Episode: 1500  Eps:0.0100  Rew:-44.1420
Episode: 1800  Eps:0.0100  Rew:-29.6340
Episode: 2100  Eps:0.0100  Rew:-15.8950
Episode: 2400  Eps:0.0100  Rew:-8.1230
Episode: 2700  Eps:0.0100  Rew:-3.8400
Episode: 3000  Eps:0.0100  Rew:4.2410
Episode: 3300  Eps:0.0100  Rew:5.8390
Episode: 3600  Eps:0.0100  Rew:7.8370
Episode: 3900  Eps:0.0100  Rew:7.8440
Episode: 4200  Eps:0.0100  Rew:8.0350
Episode: 4500  Eps:0.0100  Rew:8.0000
Episode: 4800  Eps:0.0100  Rew:7.8140


Q-LEARNING

Steps:

1. Initialize the Q matrix.
2. Decay epsilon until it reaches the threshold.
3. Choose the next action (epsilon-greedy).
4. Apply the Q-learning update.
5. Test the policy.

In [13]:
def Q_Learning(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005,
               min_eps=0.01, test_every=300, test_episodes=1000):
    """Learn a tabular action-value function with off-policy Q-learning.

    Behaviour actions are chosen with ``eps_greedy``; the update bootstraps
    from the maximum value of the next state's row. Every ``test_every``
    episodes the current table is evaluated with ``run_episodes`` and a
    progress line is printed.

    Args:
        env: environment exposing ``action_space.n``, ``observation_space.n``,
            ``reset()`` and a 4-tuple ``step(action)``.
        lr: learning rate applied to the TD error.
        num_episodes: number of training episodes.
        eps: initial exploration rate passed to ``eps_greedy``.
        gamma: discount factor.
        eps_decay: amount subtracted from ``eps`` at the start of each episode.
        min_eps: floor for ``eps``; decay never takes it below this value.
        test_every: evaluate every this many episodes; a falsy value (0/None)
            disables evaluation.
        test_episodes: number of episodes played in each evaluation.

    Returns:
        The learned ``(nS, nA)`` NumPy array ``Q``.
    """
    nA = env.action_space.n
    nS = env.observation_space.n

    Q = np.zeros((nS, nA))

    for ep in range(num_episodes):
        state = env.reset()
        done = False

        # Linear decay, clamped at the floor so a large eps_decay cannot push
        # eps below min_eps (or negative, which would disable exploration).
        if eps > min_eps:
            eps = max(min_eps, eps - eps_decay)

        while not done:
            action = eps_greedy(Q, state, eps)
            next_state, rew, done, _ = env.step(action)

            # Q-learning temporal-difference update: bootstrap from the best
            # next-state value regardless of the action taken next.
            td_target = rew + gamma * np.max(Q[next_state])
            Q[state][action] += lr * (td_target - Q[state][action])

            state = next_state

        # Periodic evaluation of the current table.
        if test_every and (ep % test_every) == 0:
            test_rew = run_episodes(env, Q, test_episodes)
            print("Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, test_rew))
    return Q

In [14]:
if __name__ == '__main__':
    # Train a tabular Q-learning agent on Gym's Taxi-v3 task; progress lines
    # are printed by Q_Learning itself.
    env = gym.make('Taxi-v3')
    print("Q_Learning")
    # Store the learned table under its own name. Assigning it to `Q_Learning`
    # would overwrite the training function, and re-running this cell would
    # then fail with "'numpy.ndarray' object is not callable".
    Q_qlearning = Q_Learning(env, lr=0.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001)


Q_Learning
Episode:    0  Eps:0.3990  Rew:-210.8000
Episode:  300  Eps:0.0990  Rew:-198.9270
Episode:  600  Eps:0.0100  Rew:-192.3180
Episode:  900  Eps:0.0100  Rew:-184.8410
Episode: 1200  Eps:0.0100  Rew:-90.6760
Episode: 1500  Eps:0.0100  Rew:-73.8650
Episode: 1800  Eps:0.0100  Rew:-30.7650
Episode: 2100  Eps:0.0100  Rew:-18.8660
Episode: 2400  Eps:0.0100  Rew:-6.5620
Episode: 2700  Eps:0.0100  Rew:-5.6830
Episode: 3000  Eps:0.0100  Rew:3.6980
Episode: 3300  Eps:0.0100  Rew:5.2140
Episode: 3600  Eps:0.0100  Rew:8.0420
Episode: 3900  Eps:0.0100  Rew:7.9150
Episode: 4200  Eps:0.0100  Rew:7.9460
Episode: 4500  Eps:0.0100  Rew:7.9660
Episode: 4800  Eps:0.0100  Rew:7.8270
