In [2]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Reward matrix: R[s, a] is the immediate reward for choosing action a in state s.
#    -1 -> move is not available (filtered out by the `>= 0` checks below)
#     0 -> move is available, no reward
#   100 -> move is available and leads into state 5
# NOTE(review): row 4 allows moves to 1 and 2, while rows 0 and 3 allow moves
# into 4 with no way back -- confirm this asymmetry is intended.
R = np.array([
    [-1, -1, -1, -1,  0,  -1],
    [-1, -1, -1,  0, -1, 100],
    [-1, -1, -1,  0, -1,  -1],
    [-1,  0,  0, -1,  0,  -1],
    [-1,  0,  0, -1, -1, 100],
    [-1,  0, -1, -1,  0, 100],
])
print(f'Reward Matrix \n \n {R}')

Reward Matrix 
 
 [[ -1  -1  -1  -1   0  -1]
 [ -1  -1  -1   0  -1 100]
 [ -1  -1  -1   0  -1  -1]
 [ -1   0   0  -1   0  -1]
 [ -1   0   0  -1  -1 100]
 [ -1   0  -1  -1   0 100]]


In [4]:
# Q matrix: one row per state, one column per action, all values start at zero
Q = np.zeros((6, 6))
print(f'Q Matrix \n \n {Q}')

Q Matrix 
 
 [[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [5]:
# Initial state - chosen arbitrarily (hard-coded here, not sampled at runtime)
initial_state = 1

# Gamma (discount parameter) - weight given to the best next-state value in the Q update
gamma = 0.8

In [6]:
# Column indices in row 1 of R whose reward is non-negative (i.e. the allowed moves)
np.nonzero(R[1] >= 0)[0]

array([3, 5], dtype=int64)

In [7]:
# Look up which actions may be taken from a given state
def available_actions(state):
    """Return the indices of the actions allowed in ``state``.

    An action counts as allowed when its entry in row ``state`` of the
    notebook-level reward matrix ``R`` is greater than or equal to zero.

    Parameters
    ----------
    state : int
        Row index into ``R``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of allowed action indices (may be empty).
    """
    return np.nonzero(R[state] >= 0)[0]

# Actions that can be taken from the initial state; the bare name on the last
# line makes the notebook display the array
available_act = available_actions(state=initial_state)
available_act

array([3, 5], dtype=int64)

In [8]:
# Next action to be performed
def next_action(available_action_range):
    """Pick one action uniformly at random from ``available_action_range``.

    The choice is drawn from the argument itself, not from any notebook-level
    variable, so the function gives correct results no matter which cell ran
    last.

    Parameters
    ----------
    available_action_range : array-like of int
        Candidate action indices; must be non-empty.

    Returns
    -------
    int
        The selected action as a plain Python ``int``.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when the candidate list is empty.
    """
    # With no `size` argument, choice() returns a scalar, so int() never has to
    # convert a 1-element array (deprecated since NumPy 1.25).
    return int(np.random.choice(available_action_range))

# Sample the first action from the moves available in the initial state;
# the bare name on the last line makes the notebook display it
action = next_action(available_action_range=available_act)
action

3

In [9]:
# Update Q Matrix
def update(current_state, action, gamma):
    """Apply one Q-learning update to the notebook-level ``Q`` matrix in place.

    Sets ``Q[current_state, action]`` to
    ``R[current_state, action] + gamma * max(Q[action, :])``.

    Parameters
    ----------
    current_state : int
        Row of ``Q`` / ``R`` being updated.
    action : int
        Column being updated; row ``action`` of ``Q`` supplies the
        next-state values.
    gamma : float
        Discount factor applied to the best next-state value.

    Returns
    -------
    None
        ``Q`` is modified in place.
    """
    # Whichever tied arg-max index is chosen, the value read back is the row
    # maximum, so take it directly. This avoids random tie-breaking and the
    # array-to-int conversions it needed (deprecated since NumPy 1.25).
    max_value = np.max(Q[action])

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Apply a single Q update for the (initial state, sampled action) pair
update(current_state=initial_state, action=action, gamma=gamma)

In [10]:
# Train with 10000 single-step updates, each from a uniformly sampled state
for i in range(10000):
    # randint(n) draws from [0, n), i.e. any row of Q
    current_state = np.random.randint(Q.shape[0])
    # These rebind the notebook-level `available_act` and `action` on every pass
    available_act = available_actions(current_state)
    action = next_action(available_act)
    update(current_state, action, gamma)

# Show Q rescaled so that its largest entry reads as 100
print(f'Trained Q-Matrix \n \n {Q/np.max(Q)*100}')

Trained Q-Matrix 
 
 [[  0.    0.    0.    0.   80.    0. ]
 [  0.    0.    0.   64.    0.  100. ]
 [  0.    0.    0.   64.    0.    0. ]
 [  0.   80.   51.2   0.   80.    0. ]
 [  0.   80.   51.2   0.    0.  100. ]
 [  0.   80.    0.    0.   80.  100. ]]


In [11]:
# Walk greedily through Q from state 2 until state 5 is reached
current_state = 2
steps = [current_state]

# A greedy walk over a Q matrix that has not converged can cycle forever;
# cap the walk so the cell fails loudly instead of hanging the kernel.
max_steps = Q.size

while current_state != 5:
    if len(steps) > max_steps:
        raise RuntimeError(
            f'Greedy walk did not reach state 5 within {max_steps} steps: {steps}'
        )

    q_row = Q[current_state]
    best_next_states = np.where(q_row == np.max(q_row))[0]

    if best_next_states.size > 1:
        # Break ties at random; choice() without `size` returns a scalar
        next_step_index = int(np.random.choice(best_next_states))
    else:
        # Index the single element rather than calling int() on a 1-element
        # array (deprecated since NumPy 1.25)
        next_step_index = int(best_next_states[0])

    steps.append(next_step_index)
    current_state = next_step_index

In [12]:
# Print the sequence of states collected in `steps` by the greedy walk
print(f'Selected Path {steps}')

Selected Path [2, 3, 1, 5]


In [13]:
import gymnasium as gym

# Create the CartPole environment with an on-screen window and start an episode
env = gym.make("CartPole-v1", render_mode="human")
env.reset()

# Drive the environment with 200 random actions
for _ in range(200):
    env.render()
    _obs, _reward, terminated, truncated, _info = env.step(env.action_space.sample()) # take a random action

    # An episode that has ended must be reset before stepping again;
    # stepping a finished episode is outside the environment's contract.
    if terminated or truncated:
        env.reset()

In [14]:
# Action space of the environment (the bare expression makes the notebook display it)
env.action_space

Discrete(2)

In [15]:
# State (observation) space of the environment (displayed as the cell output)
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [16]:
# Check the upper (`high`) and lower (`low`) bounds of the observation space
print(f'High: {env.observation_space.high}')
print(f'Low: {env.observation_space.low}')

High: [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
Low: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]


In [17]:
# Run one episode of at most 200 random steps and report an early end as a failure
env.reset()
for e in range(1, 201):  # steps 1..200 inclusive
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action) # stepping forward one step

    print(f'step={e:2d} | state={observation} | action={action} | reward={reward}')

    if terminated or truncated:
        if e < 200: # failure if less than 200 steps
            print('*** FAILED ***')
        # Never step an episode that has already ended
        break

step= 1 | state=[ 0.02241125 -0.17897816 -0.02181121  0.30261102] | action=0 | reward=1.0
step= 2 | state=[ 0.01883169  0.01644775 -0.01575899  0.00312989] | action=1 | reward=1.0
step= 3 | state=[ 0.01916064  0.21179211 -0.01569639 -0.29448324] | action=1 | reward=1.0
step= 4 | state=[ 0.02339648  0.4071343  -0.02158606 -0.59207493] | action=1 | reward=1.0
step= 5 | state=[ 0.03153917  0.6025517  -0.03342756 -0.89147854] | action=1 | reward=1.0
step= 6 | state=[ 0.0435902   0.40789875 -0.05125713 -0.6094882 ] | action=0 | reward=1.0
step= 7 | state=[ 0.05174818  0.6036984  -0.06344689 -0.9178648 ] | action=1 | reward=1.0
step= 8 | state=[ 0.06382214  0.79961807 -0.08180419 -1.2297939 ] | action=1 | reward=1.0
step= 9 | state=[ 0.07981451  0.60563827 -0.10640007 -0.9638209 ] | action=0 | reward=1.0
step=10 | state=[ 0.09192728  0.41209424 -0.12567648 -0.7063692 ] | action=0 | reward=1.0
step=11 | state=[ 0.10016916  0.2189168  -0.13980387 -0.45574015] | action=0 | reward=1.0
step=12 | 