In [1]:
import numpy as np
import gym

In [2]:
def eps_greedy(Q,s,eps=0.1):
  if np.random.uniform(0,1) < eps:
    return np.random.randint(Q.shape[1])
  else:
    return greedy(Q,s)

  and should_run_async(code)


# **Greedy Policy**


> **Returning to maximum state value**








In [3]:
def greedy(Q,s):
  return np.argmax(Q[s])

## **Policy Testing**

In [4]:
def run_episodes(env,Q,num_episodes=100,to_print=False):
  tot_rew=[]
  state=env.reset()

  for _ in range(num_episodes):
    done=False
    game_rew=0

    while not done:
      next_state,rew,done,_=env.step(greedy(Q,state))

      state=next_state
      game_rew += rew
      if done:
        state=env.reset()
        tot_rew.append(game_rew)
    if to_print:
        print('Mean score: %3f of %i games!'%(np.mean(tot_rew),num_episodes))

    return np.mean(tot_rew)

# SARSA


*  Initialize Q matrix
*  Decay the epsilon untill it reaches the threshold
*  Choose next action
*   SARSA update


*  Testing the policy





In [5]:
def SARSA(env,lr=0.01,num_episodes=10000,eps=0.3,gamma=0.95,eps_decay=0.00005):
  nA = env.action_space.n
  nS= env.observation_space.n

  Q=np.zeros((nS,nA))
  games_rewards=[]
  test_rewards=[]

  for ep in range(num_episodes):
    state= env.reset()
    done= False
    tot_rew= 0

    if eps>0.01:
        eps -= eps_decay

    action= eps_greedy(Q,state,eps)

    while not done:
      next_state,rew,done,_ =env.step(action)

      next_action = eps_greedy(Q,next_state,eps)
      #BellMan Equation
      Q[state][action] = Q[state][action] + lr*(rew+gamma*Q[next_state][next_action]-Q[state][action])

      state = next_state
      action = next_action
      tot_rew += rew
      if done:
          games_rewards.append(tot_rew)

    if (ep%300) == 0:
      test_rew=run_episodes(env,Q,1000)
      print("Episode:{:5d} Eps:{:2.4f} Rew:{:2.4f}".format(ep,eps,test_rew))
      test_rewards.append(test_rew)

  return Q



# **SARSA-Taxi v3 Data**



In [6]:
if __name__=='__main__':
  env=gym.make('Taxi-v3')
  print('SARSA')
  Q_sarsa = SARSA(env, lr=0.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001)

  deprecation(
  deprecation(


SARSA
Episode:    0 Eps:0.3990 Rew:-200.0000
Episode:  300 Eps:0.0990 Rew:-200.0000
Episode:  600 Eps:0.0100 Rew:-200.0000
Episode:  900 Eps:0.0100 Rew:-200.0000
Episode: 1200 Eps:0.0100 Rew:10.0000
Episode: 1500 Eps:0.0100 Rew:-200.0000
Episode: 1800 Eps:0.0100 Rew:8.0000
Episode: 2100 Eps:0.0100 Rew:6.0000
Episode: 2400 Eps:0.0100 Rew:6.0000
Episode: 2700 Eps:0.0100 Rew:7.0000
Episode: 3000 Eps:0.0100 Rew:11.0000
Episode: 3300 Eps:0.0100 Rew:9.0000
Episode: 3600 Eps:0.0100 Rew:5.0000
Episode: 3900 Eps:0.0100 Rew:5.0000
Episode: 4200 Eps:0.0100 Rew:8.0000
Episode: 4500 Eps:0.0100 Rew:8.0000
Episode: 4800 Eps:0.0100 Rew:7.0000


# **Q-Learning**

*  Initialize Q matrix
*  Decay the epsilon untill it reaches the threshold
*  Choose next action
*  SARSA update
*  Testing the policy


In [7]:
def Q_learning(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005):
    nA = env.action_space.n
    nS = env.observation_space.n

    Q = np.zeros((nS, nA))
    games_rewards = []
    test_rewards = []

    for ep in range(num_episodes):
        state = env.reset()
        done = False
        tot_rew = 0


        if eps > 0.01:
            eps -= eps_decay


        while not done:

            action = eps_greedy(Q, state, eps)

            next_state, rew, done, _ = env.step(action)


            Q[state][action] = Q[state][action] + lr*(rew + gamma*np.max(Q[next_state]) - Q[state][action])

            state = next_state
            tot_rew += rew
            if done:
                games_rewards.append(tot_rew)


        if (ep % 300) == 0:
            test_rew = run_episodes(env, Q, 1000)
            print("Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, test_rew))
            test_rewards.append(test_rew)

    return Q


In [8]:
if __name__ == '__main__':
    env = gym.make('Taxi-v3')
    print('Q-Learning')

    Q_qlearning = Q_learning(env, lr=.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001)

Q-Learning
Episode:    0  Eps:0.3990  Rew:-200.0000
Episode:  300  Eps:0.0990  Rew:-200.0000
Episode:  600  Eps:0.0100  Rew:13.0000
Episode:  900  Eps:0.0100  Rew:-200.0000
Episode: 1200  Eps:0.0100  Rew:11.0000
Episode: 1500  Eps:0.0100  Rew:8.0000
Episode: 1800  Eps:0.0100  Rew:15.0000
Episode: 2100  Eps:0.0100  Rew:-200.0000
Episode: 2400  Eps:0.0100  Rew:11.0000
Episode: 2700  Eps:0.0100  Rew:11.0000
Episode: 3000  Eps:0.0100  Rew:13.0000
Episode: 3300  Eps:0.0100  Rew:11.0000
Episode: 3600  Eps:0.0100  Rew:8.0000
Episode: 3900  Eps:0.0100  Rew:7.0000
Episode: 4200  Eps:0.0100  Rew:9.0000
Episode: 4500  Eps:0.0100  Rew:11.0000
Episode: 4800  Eps:0.0100  Rew:5.0000
