# Importing all the necessary libraries 

In [None]:
from env import CliffWalking
import numpy as np 
import matplotlib.pyplot as plt 

# Initialising the Environment

In [None]:
# Create the environment, reset it, and render it once.
env = CliffWalking()
env.reset()
env.render()

# Define the Q-value table Q(s,a)

In [None]:
# Q-table with one row per state (48) and one column per action (4).
# The fill value is a float on purpose: np.full infers the dtype from the fill
# value, and an integer table would truncate the fractional Monte Carlo
# updates that are later written into it.
action_values = np.full((48, 4), -100.0)
# Every action in state 47 starts at 0 (presumably the terminal/goal state --
# confirm against the environment).
action_values[47, :] = 0.

In [None]:
# Show the freshly initialised Q-table.
print(action_values)

# Define the target policy

In [None]:
def target_policy(state):
    """Return a greedy action for ``state``.

    Reads the row of the module-level ``action_values`` table that belongs to
    ``state`` and picks uniformly at random among the actions whose value
    equals the row maximum, so ties are broken randomly.
    """
    state_values = action_values[state]
    is_best = state_values == state_values.max()
    greedy_actions = np.flatnonzero(is_best)
    return np.random.choice(greedy_actions)

In [None]:
# Query the target policy once for state 0 and show the chosen action.
action = target_policy(0)
print(f"The action taken at 0 is {action}")

# Define the exploratory policy

In [None]:
def explaratory_policy(state, epsilon=0):
    """Return an epsilon-greedy action for ``state``.

    With probability ``epsilon`` a uniformly random action out of the four
    available ones is returned. Otherwise the action is chosen greedily from
    the module-level ``action_values`` table, breaking ties uniformly at
    random.
    """
    explore = np.random.random() < epsilon
    if explore:
        return np.random.choice(4)

    state_values = action_values[state]
    greedy_actions = np.flatnonzero(state_values == state_values.max())
    return np.random.choice(greedy_actions)

In [None]:
# Query the exploratory policy once for state 0. epsilon keeps its default of
# 0, so the random-exploration branch is never taken here.
action = explaratory_policy(0)
print(f"The action taken at 0 is {action}")

# Implement the algorithm

In [None]:
def off_policy_mc_control(action_values, target_policy, explaratory_policy, episodes=10000, gamma=0.99, epsilon=0.2):
    """Run off-policy Monte Carlo control with weighted importance sampling.

    Each episode is generated on the module-level ``env`` by following
    ``explaratory_policy`` (the behaviour policy). The episode is then replayed
    backwards to update ``action_values`` in place.

    Args:
        action_values: 2-D Q-table indexed as ``[state][action]``; modified in
            place. It should have a floating-point dtype, because NumPy casts
            each updated entry back to the array dtype and an integer table
            would truncate the fractional updates.
        target_policy: Callable ``state -> action`` giving the policy that is
            being evaluated and improved.
        explaratory_policy: Callable ``(state, epsilon) -> action`` used to
            generate the episodes.
        episodes: Number of episodes to run.
        gamma: Discount factor applied to the return.
        epsilon: Exploration rate passed to ``explaratory_policy``; also used
            to compute the importance-sampling weight.

    Returns:
        None. The result is the updated ``action_values`` table.
    """
    # Cumulative importance weights C(s, a), shaped like the Q-table.
    n_actions = np.shape(action_values)[1]
    csa = np.zeros(np.shape(action_values))

    # Probability with which an epsilon-greedy behaviour policy picks the
    # greedy action (assuming a unique greedy action); constant for the run.
    greedy_prob = 1 - epsilon + epsilon / n_actions

    for episode in range(1, episodes+1):
        G = 0
        W = 1
        state = env.reset()
        done = False
        terminated = False
        transitions = []

        # Roll out one episode. The parentheses matter: without them the
        # condition parses as ``(not done) or terminated`` and the loop never
        # stops once ``terminated`` becomes True.
        while not (done or terminated):
            action = explaratory_policy(state, epsilon)
            next_state, reward, done, terminated = env.step(action)
            transitions.append([state, action, reward])
            state = next_state

        # Walk the episode backwards, accumulating the discounted return and
        # applying the weighted importance-sampling update.
        for state_t, action_t, reward_t in reversed(transitions):
            G = reward_t + gamma * G
            csa[state_t][action_t] += W
            qsa = action_values[state_t][action_t]
            action_values[state_t][action_t] += (W / csa[state_t][action_t]) * (G - qsa)

            # Once the behaviour action differs from the target policy's
            # action, a deterministic target policy gives all earlier steps an
            # importance ratio of zero, so the rest of the episode is skipped.
            if action_t != target_policy(state_t):
                break

            W = W * 1. / greedy_prob
        print(episode)

In [None]:
# Train with the default settings (10000 episodes, gamma=0.99, epsilon=0.2);
# action_values is updated in place.
off_policy_mc_control(action_values, target_policy, explaratory_policy)

In [None]:
# Show the Q-table after training.
print(action_values)

# Test agent function 

In [None]:
def test_agent(policy, episodes=1, epsilon=0.2):
    """Run ``policy`` on the module-level ``env`` and render every step.

    Args:
        policy: Callable ``state -> action`` used to pick each action.
        episodes: Number of episodes to play.
        epsilon: Accepted for interface compatibility; not used by this
            function.

    The 1-based index of each finished episode is printed.
    """
    env.pygame_init()
    for episode_index in range(episodes):
        state = env.reset()
        finished = False
        while not finished:
            chosen_action = policy(state)
            state, _, done, terminated = env.step(chosen_action)
            env.render()
            finished = done or terminated
        print(episode_index + 1)

In [None]:
# Run the target policy for three rendered episodes.
test_agent(target_policy, episodes=3)

In [None]:
# Close the environment now that the run is finished.
env.close()