# Importing all the libraries

In [None]:
import numpy as np 
import matplotlib.pyplot as plt

from env import CliffWalking

# Creating the environment

In [None]:
# Build the shared environment instance used by every cell below,
# reset it, and render it once.
env = CliffWalking()
env.reset()
env.render()

# Creating the Q(s,a) table

In [None]:
# Q(s, a) table, initialised to zero. It is indexed elsewhere in this file as
# action_values[state][action], i.e. 48 state rows by 4 action columns.
action_values = np.zeros(shape=(48, 4), dtype=float)

# Creating the target policy

In [None]:
def target_policy(state):
    """Return a greedy action for ``state`` from the global Q-table.

    The action with the largest value in ``action_values[state]`` is chosen;
    when several actions share that maximum, one of them is picked uniformly
    at random.
    """
    q_row = action_values[state]
    best_actions = np.flatnonzero(q_row == q_row.max())
    return np.random.choice(best_actions)

## Creating the exploratory policy

In [None]:
def exploratory_policy(state):
    """Return a uniformly random action index in ``{0, 1, 2, 3}``.

    ``state`` is accepted so the function has the same call signature as the
    target policy, but it does not influence the choice.
    """
    n_actions = 4
    return np.random.choice(n_actions)

# Implementing the algorithm

In [None]:
def q_learning(action_values, exploratory_policy, target_policy, episodes=10000, alpha=0.1, gamma=0.99):
    """Run tabular Q-learning on the module-level ``env``.

    Actions are selected by ``exploratory_policy`` (the behaviour policy),
    while the bootstrap value comes from the action that ``target_policy``
    picks in the next state. ``action_values`` is updated in place.

    Args:
        action_values: table indexed as ``action_values[state][action]``.
        exploratory_policy: callable ``state -> action`` used to act in the
            environment.
        target_policy: callable ``state -> action`` used to choose the action
            whose value is bootstrapped from.
        episodes: number of episodes to run.
        alpha: step size of the update.
        gamma: discount factor applied to the next state's value.

    Returns:
        None. The learned values are written into ``action_values``.
    """
    for episode in range(1, episodes + 1):
        state = env.reset()
        done, terminated = False, False
        print(episode)
        # The parentheses matter: ``not done or terminated`` parses as
        # ``(not done) or terminated`` and would keep stepping a finished
        # episode (potentially forever) once ``terminated`` becomes True.
        while not (done or terminated):
            action = exploratory_policy(state)
            next_state, reward, done, terminated = env.step(action)
            next_action = target_policy(next_state)

            qsa = action_values[state][action]
            next_qsa = action_values[next_state][next_action]
            # Move Q(s, a) towards the one-step target r + gamma * Q(s', a').
            action_values[state][action] = qsa + alpha * (reward + gamma * next_qsa - qsa)

            # The next action is re-drawn from the exploratory policy at the
            # top of the loop, so only the state is carried over.
            state = next_state

In [None]:
# Train for 1000 episodes; q_learning writes its updates into action_values.
q_learning(action_values, exploratory_policy, target_policy, episodes = 1000)

# Showing results

In [None]:
# Dump the Q-table after training for inspection.
print(action_values)

In [None]:
def test_agent(policy, episodes=3, epsilon=0):
    """Roll out ``policy`` on the module-level ``env``, rendering every step.

    Args:
        policy: callable ``state -> action`` used to pick each action.
        episodes: number of episodes to play.
        epsilon: accepted but not used by this function.

    Prints the 1-based episode number after each episode finishes.
    """
    env.pygame_init()
    for episode_number in range(1, episodes + 1):
        state = env.reset()
        finished = False
        while not finished:
            state, _reward, done, terminated = env.step(policy(state))
            env.render()
            finished = done or terminated
        print(episode_number)

In [None]:
# Play the default number of episodes with the greedy (target) policy.
test_agent(target_policy)

In [None]:
# Call the environment's close() now that evaluation is finished.
env.close()