<a href="https://colab.research.google.com/github/HanlalaIbrahim/AI/blob/main/Q_learning_%26_SARSA_Taxi_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import gym

In [2]:
def eps_greedy(Q, s, eps=0.1):
  if np.random.uniform(0,1)<eps:
    return np.random.randint(Q.shape[1])
  else:
    return greedy(Q, s)

## **Greedy** **Policy**

> *Returning to Maximum Action State Value*



In [3]:
def greedy(Q, s):
    return np.argmax(Q[s])

## **Policy** **Testing**

In [17]:
def run_episodes(env, Q, num_episodes=100, to_print=False):
  tot_rew = []
  state = env.reset()

  for _ in range(num_episodes):
    done = False
    game_rew = 0

    while not done:
      next_state, rew, done, _ = env.step(greedy(Q, state))

      state = next_state
      game_rew += rew
      if done:
        state = env.reset()
        tot_rew.append(game_rew)

  if to_print:
    print('Mean Score: %.3f of %i games!'%(np.mean(tot_rew), num_episodes))

  return np.mean(tot_rew)

#**SARSA**




*  initialize Q Matrix
*   Decay tge epsion untill it reaches the threshold
*   Choose next Action
*   SARSA Update
*   Testing the Policy





#      **Bellman** **Equation**
      
      Q[state][action] = Q[state][action] + lr*(rew + gamma*Q[next_state][next_action] - Q[state][action])

In [18]:
def SARSA(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005):
  nA = env.action_space.n
  nS = env.observation_space.n

  Q = np.zeros((nS, nA))
  games_rewards = []
  test_rewards = []

  for ep in range(num_episodes):
    state =  env.reset()
    done = False
    tot_rew = 0

    if eps > 0.01:
      eps -= eps_decay


    action = eps_greedy(Q, state, eps)

    while not done:
      next_state, rew, done, _ = env.step(action)

      next_action =  eps_greedy(Q, next_state, eps)

      Q[state][action] = Q[state][action] + lr*(rew + gamma*Q[next_state][next_action] - Q[state][action])

      state = next_state
      action = next_action
      tot_rew += rew
      if done:
        games_rewards.append(tot_rew)

    if (ep % 300) == 0:
      test_rew = run_episodes(env, Q, 1000)
      print("Episode:{:5d} Eps:{:2.4f} Rew:{:2.4f}".format(ep, eps, test_rew))
      test_rewards.append(test_rew)

  return Q

# **SARSA - Taxi v3 Data**

In [19]:
if __name__ == '__main__':
  env = gym.make('Taxi-v3')
  print('SARSA')
  Q_sarsa = SARSA(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005)

SARSA
Episode:    0 Eps:0.2999 Rew:-230.4200
Episode:  300 Eps:0.2850 Rew:-216.0740
Episode:  600 Eps:0.2700 Rew:-200.0000
Episode:  900 Eps:0.2550 Rew:-200.0000
Episode: 1200 Eps:0.2400 Rew:-198.9300
Episode: 1500 Eps:0.2250 Rew:-200.0000
Episode: 1800 Eps:0.2100 Rew:-200.0000
Episode: 2100 Eps:0.1950 Rew:-200.0000
Episode: 2400 Eps:0.1800 Rew:-199.5720
Episode: 2700 Eps:0.1650 Rew:-198.7100
Episode: 3000 Eps:0.1500 Rew:-199.1400
Episode: 3300 Eps:0.1350 Rew:-192.7920
Episode: 3600 Eps:0.1200 Rew:-194.6680
Episode: 3900 Eps:0.1050 Rew:-191.6850
Episode: 4200 Eps:0.0900 Rew:-183.0030
Episode: 4500 Eps:0.0750 Rew:-185.7270
Episode: 4800 Eps:0.0600 Rew:-187.4600
Episode: 5100 Eps:0.0450 Rew:-182.5400
Episode: 5400 Eps:0.0300 Rew:-170.5670
Episode: 5700 Eps:0.0150 Rew:-177.9150
Episode: 6000 Eps:0.0100 Rew:-175.8710
Episode: 6300 Eps:0.0100 Rew:-157.3940
Episode: 6600 Eps:0.0100 Rew:-164.6840
Episode: 6900 Eps:0.0100 Rew:-160.2670
Episode: 7200 Eps:0.0100 Rew:-196.4790
Episode: 7500 Eps:0

# **Q-Learning**

* initilalizing Q matrix
* Decay the epsilon untill it reaches the threshold
* Select Action Following Epsilon-Greedy policy
* Q-Learning updates State-Action Value
* Testing the Policy

In [22]:
def Q_Learning(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005):
  nA = env.action_space.n
  nS = env.observation_space.n

  Q = np.zeros((nS, nA))
  games_rewards = []
  test_rewards = []

  for ep in range(num_episodes):
    state =  env.reset()
    done = False
    tot_rew = 0

    if eps > 0.01:
      eps -= eps_decay

    while not done:
      action = eps_greedy(Q, state, eps)

      next_state, rew, done, _ = env.step(action)


      Q[state][action] = Q[state][action] + lr*(rew + gamma*np.max([next_state]) - Q[state][action])

      state = next_state
      tot_rew += rew
      if done:
        games_rewards.append(tot_rew)

    if (ep % 300) == 0:
      test_rew = run_episodes(env, Q, 1000)
      print("Episode:{:5d} Eps:{:2.4f} Rew:{:2.4f}".format(ep, eps, test_rew))
      test_rewards.append(test_rew)

  return Q

# **Q-Learning - Taxi v3 Data**

In [23]:
if __name__ == '__main__':
  env = gym.make('Taxi-v3')
  print('Q-Learning')
  Q_Learning = Q_Learning(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005)

Q-Learning
Episode:    0 Eps:0.2999 Rew:-233.9660
Episode:  300 Eps:0.2850 Rew:-524.1350
Episode:  600 Eps:0.2700 Rew:-535.0430
Episode:  900 Eps:0.2550 Rew:-504.8300
Episode: 1200 Eps:0.2400 Rew:-504.5960
Episode: 1500 Eps:0.2250 Rew:-477.7130
Episode: 1800 Eps:0.2100 Rew:-502.4360
Episode: 2100 Eps:0.1950 Rew:-502.7600
Episode: 2400 Eps:0.1800 Rew:-461.7290
Episode: 2700 Eps:0.1650 Rew:-434.9810
Episode: 3000 Eps:0.1500 Rew:-429.4370
Episode: 3300 Eps:0.1350 Rew:-484.3730
Episode: 3600 Eps:0.1200 Rew:-534.1970
Episode: 3900 Eps:0.1050 Rew:-473.7170
Episode: 4200 Eps:0.0900 Rew:-398.4680
Episode: 4500 Eps:0.0750 Rew:-400.6190
Episode: 4800 Eps:0.0600 Rew:-413.1740
Episode: 5100 Eps:0.0450 Rew:-434.6660
Episode: 5400 Eps:0.0300 Rew:-422.1740
Episode: 5700 Eps:0.0150 Rew:-368.2730
Episode: 6000 Eps:0.0100 Rew:-398.7920
Episode: 6300 Eps:0.0100 Rew:-372.1340
Episode: 6600 Eps:0.0100 Rew:-377.4890
Episode: 6900 Eps:0.0100 Rew:-380.9810
Episode: 7200 Eps:0.0100 Rew:-391.8080
Episode: 7500 