In [6]:
###########################################
### 
### Marcus Blaisdell
### Homework 6, Extra Credit 
### April 26, 2019
### Professor Diane Cook
### 
### Q-Learning and SARSA algorithm
### modifications were made in accordance 
### with the algorithms described here:
### https://www.cse.unsw.edu.au/~cs9417ml/RL1/algorithms.html
### 
### The Q-learning and SARSA algorithms
### each run in 25-45 seconds each,
### the ignorant agent takes ~1 hour to run
### 
### The reporting metrics show similar
### average time-steps for all three
### functions and the evaluate_solution
### function reports no penalties for any
### of them using the learned models. 
### 
### Changing the movement reward to -4 
### did not have an apparent effect but I 
### had expected it to produce shorter runs
### since the higher penalty would create
### a higher reward for fewer movements.
### 
###########################################



!pip install cmake 'gym[atari'] scipy

import gym
import numpy as np
import random
from IPython.display import clear_output
import time
from time import sleep
import random

frames = []

def init():
   env = gym.make("Taxi-v2").env
   env.reset() # reset environment to a new random state
  
   print("Action Space {}".format(env.action_space))
   print("State Space {}".format(env.observation_space))
   # (taxi row, taxi column, passenger index, destination index)
   state = env.encode(3, 1, 2, 0) 
   print("State:", state)

   env.s = state
   env.render()
   return env

def print_frames(frames, firstnum, lastnum):
   for i, frame in enumerate(frames):
      if i < firstnum or i > (len(frames) - lastnum):
        clear_output(wait=True)
        print(frame['frame'].getvalue())
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(0.5)
          

####################################
### begin train_agent

def train_agent(env, theReward):
   Q = np.zeros([env.observation_space.n, env.action_space.n])
   alpha = 0.1
   gamma = 0.6
   epsilon = 0.1

   # For plotting metrics
   all_epochs = []
   all_penalties = []

   for i in range(1, 100001):
      state = env.reset()

      epochs, penalties, reward, = 0, 0, 0
      
      done = False
      while not done:
         if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
         else:
            action = np.argmax(Q[state]) # Exploit learned values

         next_state, reward, done, info = env.step(action) 
         if reward == -1:
            reward = theReward
        
         old_value = Q[state, action]
         next_max = np.max(Q[next_state])
        
         #new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
         ### Using the Q-learning formula from lecture:
        
         new_value = old_value + (alpha * (reward + (gamma * next_max) - old_value))
         Q[state, action] = new_value

         if reward == -10:
            penalties += 1

         state = next_state
         epochs += 1
          
      all_penalties.append (penalties)
        
   print("Training finished.\n")
   print ("Reward: ", reward)
  
   return Q
       
### end train_agent
####################################



####################################
### begin sarsa

def train_agent_sarsa(env, theReward):
   Q = np.zeros([env.observation_space.n, env.action_space.n])
   alpha = 0.1
   gamma = 0.6
   epsilon = 0.1

   # For plotting metrics
   all_epochs = []
   all_penalties = []

   for i in range(1, 100001):
      state = env.reset()

      epochs, penalties, reward, = 0, 0, 0
      done = False
      
      
        # Choose the action determined by the policy:
        
      action = np.argmax(Q[state]) 
      
      while not done:
            
         next_state, reward, done, info = env.step(action) 
         if reward == -1:
            reward = theReward
        
         old_value = Q[state, action]
         next_max = np.max(Q[next_state])
        
         #new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
         
         ### Using the SARSA formula:
          
         new_value = old_value + (alpha * (reward + (gamma * next_max) - old_value))
         Q[state, action] = new_value

         if reward == -10:
            penalties += 1

         state = next_state
         action = np.argmax (Q[state])
          
         epochs += 1
        
   print("Training finished.\n")
   print ("Reward: ", reward)
   return Q
  
### end sarsa
####################################


####################################
### begin ignorant_agent

def train_agent_ignorant(env, theReward):
   Q = np.zeros([env.observation_space.n, env.action_space.n])
   alpha = 0.1
   gamma = 0.6
   epsilon = 0.1

   # For plotting metrics
   all_epochs = []
   all_penalties = []

   for i in range(1, 100001):
      state = env.reset()

      epochs, penalties, reward, = 0, 0, 0
      
      done = False
      
      while not done:
         #action = np.argmax(Q[state]) # Exploit learned values
          
         action = random.randint (0,5)

         next_state, reward, done, info = env.step(action) 
         if reward == -1:
            reward = theReward
        
         old_value = Q[state, action]
         
         next_max = np.max(Q[next_state])
        
         #new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
         new_value = old_value + (alpha * (reward + (gamma * next_max) - old_value))
         Q[state, action] = new_value

         if reward == -10:
            penalties += 1

         state = next_state
         epochs += 1
        
   print("Training finished.\n")
   print ("reward: ", reward)
   return Q
  
### end ignorant_agent
####################################
    
#################################### 
### begin evaluate_solution 

def evaluate_solution(env, Q, theReward):
   total_epochs, total_penalties = 0, 0
    
   episodes = 100

   for _ in range(episodes):
      state = env.reset()
      epochs, penalties, reward = 0, 0, 0
    
      done = False 
                                  
      while not done:
         action = np.argmax(Q[state])
         state, reward, done, info = env.step(action)
         if reward == -1:
            reward = theReward
         # Put each rendered frame into dict for animation

         frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward})


         if reward == -10:
            penalties += 1

         epochs += 1

      total_penalties += penalties
      total_epochs += epochs

   print(f"Results after {episodes} episodes:")
   print ("total reward: ", reward)
   print(f"Average timesteps per episode: {total_epochs / episodes}")
   print(f"Average penalties per episode: {total_penalties / episodes}")
  
### end evaluate_solution
####################################

  
### learning algorithms: train and test:

########################################
### begin q_learning:

def q_learning(env, theReward):
   Q = train_agent(env, theReward)
   evaluate_solution(env, Q, theReward)
   #return frames

### end q_learning:
########################################

    
########################################
### begin sarsa_learning:

def sarsa(env, theReward):
   Q = train_agent_sarsa(env, theReward)
   evaluate_solution(env, Q, theReward)
   #return frames
    
### end sarsa_learning:
########################################

########################################
### begin ignorant_agent_learning:

def ignorant_agent(env, theReward):
   Q = train_agent_ignorant(env, theReward)
   evaluate_solution(env, Q, theReward)
   #return frames
    
### end ignorant_agent_learning:
########################################

    
def main ():
  env = init()
  
    ### set animate = 1 to run the animation, 
    ### a value of 'not 1' will not run the animation
  
  animate = 0
  
    ### test with multiple movement rewards
  
  rewardList = [-1, -4]
  
  for theReward in rewardList:
    
    print ("\n\n\t*** Reward: ", theReward, "\n\n")

    
    ###################
    ### run q_learning:

    print ("\t*** q_learning ***\n\n")

    qStartTime = time.time ()
    q_learning(env, theReward)
    qEndTime = time.time ()

    print ("q_learning run time: ", str(qEndTime - qStartTime))

    if (print == 1):
      print_frames(frames, 10, 10)
      
    ### end q_learning
    ##################
    
    
    #######################
    ### run sarsa_learning:
    
    print ("\n\n\t*** SARSA ***\n\n")

    sStartTime = time.time ()
    sarsa(env, theReward)
    sEndTime = time.time ()

    print ("SARSA run time: ", sEndTime - sStartTime)
    
    if (animate == 1):
      print_frames(frames, 10, 10)
      
    ### end sarsa_learning
    ######################
    
    
    ######################
    ### run ignorant_agent

    print ("\n\n\t*** Ignorant Agent ***\n\n")

    iStartTime = time.time ()
    ignorant_agent (env, theReward)
    iEndTime = time.time ()

    print ("ignorant agent run time: ", str(iEndTime - iStartTime))
    
    if (print == 1):
      print_frames(frames, 10, 10)
      
    ### end ignorant agent
    ######################
    
  
if __name__ == "__main__":
  main ()


Action Space Discrete(6)
State Space Discrete(500)
State: 328
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



	*** Reward:  -1 


	*** q_learning ***


Training finished.

Reward:  20
Results after 100 episodes:
total reward:  20
Average timesteps per episode: 12.68
Average penalties per episode: 0.0
q_learning run time:  27.532066345214844


	*** SARSA ***


Training finished.

Reward:  20
Results after 100 episodes:
total reward:  20
Average timesteps per episode: 12.77
Average penalties per episode: 0.0
SARSA run time:  23.26818299293518


	*** Reward:  -4 


	*** q_learning ***


Training finished.

Reward:  20
Results after 100 episodes:
total reward:  20
Average timesteps per episode: 12.43
Average penalties per episode: 0.0
q_learning run time:  27.895546197891235


	*** SARSA ***


Training finished.

Reward:  20
Results after 100 episodes:
total reward:  20
Average timesteps per episode: 12.29
Average penaltie