In [None]:
## Reinforcement Learning ##
# You have an agent that explores space (Think Pac-Man)
# It learns the value of different state changes in different conditions
# These values inform agent's future behaviour
# When space explored = fast online performance

## Q-Learning ##
# Specific Reinforcement Learning implementation
# Set of environmental states (s), possible actions in each state (a) and a value of each state/action (Q)
# Pac-Man: s is wall/ghost etc., a is movement and Q is positive/negative value
# Start of with Q values of 0, explore space.
# Bad things (ghost eats you), reduce Q. Good things (eat ghost) increase Q
# Discount factor: look beyond one step. s: previous state, s' is current state
# Q(s,a) += discount*(reward(s,a)+max(Q(s'))-Q(s,a))

In [None]:
## Exploration problem (Can be called Markov decision process (MDP)) ##
# What's the best way to explore all possible states
# Simple option: For state s, choose action with highest Q. If tie, random choice
# Can miss paths this way, so better to use a Epsilon Term
# If a random no. < epsilon, use random choice vs following highest Q
# Here exploration never stops but choosing epsilon can be tricky

In [5]:
# Taxi Problem: Build self-driving taxi to pick up and drop off at fixed pts
# Get there quickest while avoiding obstacles
import gym,random

random.seed(1234)
streets = gym.make("Taxi-v3").env
streets.render()

# R,G,B,Y = pick up/drop off locations
# Blue = pick up passenger, Magenta= drop them off
# | are obstacles, : are OK
# 5x5 grid so 25 locations, 4 destinations. Passenger can be in taxi or at one pt (5 pts)
# 25*4*5 = 500 states. Six possible actions: pickup, drop off and move North,South,East and West
# Q-Learning awards 20pts for drop off, -1pt per step and illegal action -10pt

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+



In [9]:
# Initial state: taxi (Yellow) at (2,3), pickup at 2 and drop off at 0
initialState = streets.encode(2,3,2,0)
streets.s = initialState # street state
streets.render()
streets.P[initialState] # scoring

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



{0: [(1.0, 368, -1, False)],
 1: [(1.0, 168, -1, False)],
 2: [(1.0, 288, -1, False)],
 3: [(1.0, 248, -1, False)],
 4: [(1.0, 268, -10, False)],
 5: [(1.0, 268, -10, False)]}

In [12]:
# Train 10k taxi runs, each has 10% chance of exploring (discount factor)
import numpy as np

#
q_table = np.zeros([streets.observation_space.n,streets.action_space.n])
learningRate = 0
discountFactor = 0.6
exploration = 0.1
epochs = 10000

# For each run, reset field. 
for run in range(epochs):
    state = streets.reset()
    finished = False
    
    # Draw random number between 0 +1
    # If less than exploration choose random action
    # Else highest Q
    while not finished:
        randomValue = random.uniform(0,1)
        if(randomValue < exploration):
            action = streets.action_space.sample()
        else:
            action = np.argmax(q_table[state])
            
        nextState, scoring, finished, info = streets.step(action)
        
        # Q-Learning
        previousQ = q_table[state,action]
        nextMaxQ = np.max(q_table[nextState])
        newQ = (1 -learningRate)*previousQ+learningRate*(scoring+discountFactor*nextMaxQ)
        state = nextState

KeyboardInterrupt: 