# Cliff Walking 

### Setup the environment 

In [1]:
import gym
import random

myenv = gym.make("CliffWalking-v0")

# Optional step
num_rows = 4  # Number of rows in the environment grid
num_cols = 12  # Number of columns in the environment grid
cliff_indexes = [37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48]  # Cliff positions
valid_states = [state for state in range(myenv.observation_space.n) if state not in cliff_indexes]
initial_state = random.choice(valid_states)
myenv.s = initial_state

myenv.render()

o  o  o  o  o  o  o  o  o  o  x  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T



### Initial state 

In [2]:
x_row = 3
x_col = 0
initial_state = x_row * num_cols + x_col
myenv.s = initial_state
myenv.render()

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T



### Verify that the q_table size is correct

In [3]:
import numpy as np
q_table = np.zeros([myenv.observation_space.n, myenv.action_space.n]) # 48 , 4 
q_table.size

192

### Training  
##### with the Initial values of learning_rate, discount_factor, exploration & epochs

In [4]:
import numpy as np
q_table = np.zeros([myenv.observation_space.n, myenv.action_space.n]) 
G = 0
learning_rate = 0.1
discount_factor = 0.5
exploration = 0.1
epochs = 10000
trip_lengths = []

for episode in range(epochs):
    state = myenv.reset()
    done = False
    G = 0
    steps = 0

    while not done :
        steps += 1
        random_value = random.uniform(0, 1)

        if random_value < exploration:
            action = myenv.action_space.sample()  # Explore a random action
        else:
            action = np.argmax(q_table[state])  # Return the action with the highest q-value

        next_state, reward, done, info = myenv.step(action)  # Perform the chosen action

        prev_q = q_table[state, action]
        next_max_q = np.max(q_table[next_state])
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        G += reward
        q_table[state, action] = new_q
        state = next_state
    if episode % 1000 == 0:
        print("Episode:", episode,", Total Reward:", G ,", Steps :", steps)
if epochs % 1000 == 0:
    print("Episode:", epochs, ", Total Reward:", G, ", Steps:", steps)

Episode: 0 , Total Reward: -355 , Steps : 157
Episode: 1000 , Total Reward: -13 , Steps : 13
Episode: 2000 , Total Reward: -13 , Steps : 13
Episode: 3000 , Total Reward: -433 , Steps : 37
Episode: 4000 , Total Reward: -125 , Steps : 26
Episode: 5000 , Total Reward: -15 , Steps : 15
Episode: 6000 , Total Reward: -332 , Steps : 35
Episode: 7000 , Total Reward: -15 , Steps : 15
Episode: 8000 , Total Reward: -13 , Steps : 13
Episode: 9000 , Total Reward: -17 , Steps : 17
Episode: 10000 , Total Reward: -15 , Steps: 15


### Testing and Evaluation

In [5]:
from IPython.display import clear_output
from time import sleep

num_episodes = 10
total_rewards = 0
total_steps = 0

for episode in range(num_episodes):
    state = myenv.reset()
    done = False
    steps = 0
    rewards = 0

    while not done and steps < 25:
        action = np.argmax(q_table[state])
        next_state, reward, done, info = myenv.step(action)

        clear_output(wait=True)
        print("Episode:", episode + 1, "Step:", steps + 1)
        print(myenv.render(mode='ansi'), end="\r")  # Use end="\r" to overwrite the previous output
        print("Current state:", state)  # Print the current state
        print("Chosen action:", action)  # Print the chosen action
        sleep(.2)
        
        state = next_state
        steps += 1
        rewards += reward

    total_rewards += rewards
    total_steps += steps

average_rewards = total_rewards / num_episodes
average_steps = total_steps / num_episodes

print("Average Rewards:", average_rewards)
print("Average Steps:", average_steps)


Episode: 10 Step: 13
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  x

Current state: 35
Chosen action: 2
Average Rewards: -13.0
Average Steps: 13.0


                                                                                                  Hala Khalifeh