# Importing the required libraries

In [None]:
import numpy as np 
import gymnasium as gym 
import matplotlib.pyplot as plt 
import random
from statistics import mean
from IPython.display import clear_output
from IPython import display

# Creating the Environment

In [None]:
# Build the Cliff Walking environment. With render_mode="rgb_array",
# env.render() hands back an image frame instead of opening a window.
env = gym.make(
    "CliffWalking-v0",
    render_mode="rgb_array",
)
env.reset()

# Creating the Q(s, a) action-value table

In [None]:
# Randomly initialised Q-table, indexed as action_values[state, action]:
# 48 rows (states) by 4 columns (actions), values drawn uniformly from [0, 1).
action_values = np.random.rand(48, 4)

### Inspecting the action values of state 0 (grid cell (0, 0))

In [None]:
# Display the row of action values stored for state index 0.
action_values[0]

# Defining the policy

In [None]:
def policy(state, epsilon=0.2):
    """Pick an action epsilon-greedily from the module-level ``action_values``.

    Args:
        state: Index of the current state; selects a row of ``action_values``.
        epsilon: Probability of taking a random action instead of the
            greedy one.

    Returns:
        A random sample from ``env.action_space`` when exploring, otherwise
        the index of the largest value in the state's row (the first such
        index on ties, as given by ``np.argmax``).
    """
    q_row = action_values[state]

    # Explore: a uniform draw below epsilon triggers a random action.
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()

    # Exploit: act greedily with respect to the current estimates.
    return np.argmax(q_row)


# Implementing the algorithm

In [None]:
def on_policy_mc_cotrol(policy: callable, action_values, episodes: int = 10000, gamma: float = 0.99, epsilon: float = 0.2) -> None:
    """Run on-policy, every-visit Monte Carlo control on the module-level ``env``.

    For each episode a trajectory is sampled by following ``policy``; the
    discounted return of every visited (state, action) pair is then averaged
    into ``action_values``, which is updated in place.

    Args:
        policy: Callable invoked as ``policy(state, epsilon)`` that returns
            the action to take.
        action_values: 2-D array indexed as ``[state, action]``; overwritten
            in place with the mean of the observed returns.
        episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        epsilon: Exploration rate forwarded to ``policy``.

    Returns:
        None. The result is the mutated ``action_values`` array.
    """
    # Keep a running sum and visit count per (state, action) instead of the
    # full list of returns: the mean is then O(1) per update and memory
    # stays constant however long training runs.
    table_shape = np.shape(action_values)
    returns_sum = np.zeros(table_shape, dtype=float)
    returns_count = np.zeros(table_shape, dtype=int)

    for episode in range(episodes):
        trajectory = []
        state, _ = env.reset()
        done = False
        while not done:
            action = policy(state, epsilon)
            next_state, reward, terminated, truncated, _ = env.step(action)
            trajectory.append((state, action, reward))
            state = next_state
            # Stop on termination or truncation reported by the environment,
            # and also treat the -100 reward as the end of the episode.
            # No env.render() here: the frame was never used during training.
            done = terminated or truncated or reward == -100

        print(episode)

        # Walk the trajectory backwards so G accumulates the discounted
        # return that follows each step.
        G = 0.0
        for state_t, action_t, reward_t in reversed(trajectory):
            G = reward_t + gamma * G
            returns_sum[state_t, action_t] += G
            returns_count[state_t, action_t] += 1
            action_values[state_t, action_t] = (
                returns_sum[state_t, action_t] / returns_count[state_t, action_t]
            )
        

In [None]:
# Train with an exploration rate of 0.75 for 10000 requested episodes;
# action_values is updated in place.
on_policy_mc_cotrol(
    policy,
    action_values,
    episodes=10000,
    epsilon=0.75,
)

In [None]:
# Print the whole action-value table for inspection.
print(action_values)

In [None]:
# Rebind action_values to the array stored in 'epsilon10.npy'; whatever
# table was held under this name before is no longer used afterwards.
# NOTE(review): presumably a table saved from an earlier training run — confirm.
action_values = np.load('epsilon10.npy')

In [None]:
def test_agent(policy, action_values, episodes=1, epsilon = 0.2):
    """Roll out ``policy`` on the module-level ``env`` and show each frame.

    Args:
        policy: Callable invoked as ``policy(state, epsilon)`` that returns
            the action to take.
        action_values: Not read by this function; kept so existing calls
            keep working.
        episodes: Number of episodes to play.
        epsilon: Exploration rate forwarded to ``policy``.

    Returns:
        None. Frames are drawn inline when ``env.render_mode`` is
        ``"rgb_array"``.
    """
    for _ in range(episodes):
        state, _ = env.reset()
        done = False
        while not done:
            action = policy(state, epsilon)
            next_state, reward, terminated, truncated, _ = env.step(action)
            frame = env.render()
            # Stop on termination or truncation reported by the environment,
            # and also treat the -100 reward as the end of the episode.
            done = terminated or truncated or reward == -100
            state = next_state
            if env.render_mode == "rgb_array":
                plt.imshow(frame)
                # Call plt.axis; assigning to it would overwrite the function.
                plt.axis('off')
                display.display(plt.gcf())
                # wait=True defers clearing until the next output arrives,
                # which avoids flicker between frames.
                display.clear_output(wait=True)


In [None]:
# Replay the agent with epsilon=0, i.e. always taking the greedy action.
test_agent(policy, action_values, epsilon=0)