# Importing the required libraries

In [1]:
import numpy as np 
import gymnasium as gym 
import matplotlib.pyplot as plt 
from statistics import mean
from IPython.display import clear_output
from IPython import display
import random

from env import CliffWalking

# Creating the Environment

In [2]:
# Build the project-local CliffWalking environment (imported from env.py) with
# render_mode="human", reset it, and render it once.
# This `env` is a module-level global used directly by the functions defined below.
env = CliffWalking(render_mode="human")
env.reset()
env.render()

# Creating the Q(s, a) action-value table

In [3]:
# 48 x 4 table indexed as action_values[state][action], initialised with
# uniform random values in [0, 1).
# Presumably 48 = 4 x 12 grid cells and 4 = move directions -- confirm against env.py.
# NOTE(review): no random seed is set, so the initial table differs on every run.
action_values = np.random.rand(48,4)

### Inspecting the initial action values for state (0,0)

In [4]:
# Row 0 holds the four action values stored for state index 0.
action_values[0]

array([0.03897977, 0.4108894 , 0.4185447 , 0.0213924 ])

# Defining the policy

In [5]:
def policy(state, epsilon=0.2):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Reads the module-level ``action_values`` table and the module-level
    ``env`` rather than taking them as arguments.

    Args:
        state: Index selecting a row of ``action_values``.
        epsilon: Threshold compared against one uniform draw from [0, 1];
            when the draw is below it, a random action is taken.

    Returns:
        An action sampled from ``env.action_space`` when exploring, otherwise
        the index of the largest value in the state's row.
    """
    q_row = action_values[state]
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()
    # Greedy branch: np.argmax yields the first index of the maximum value.
    return np.argmax(q_row)


# Implementing the algorithm

In [6]:
def on_policy_mc_cotrol(policy: callable, action_values, episodes: int = 10000, gamma: float = 0.99, epsilon: float = 0.2, alpha: float = 0.2) -> None:
    """Run every-visit, constant-step-size Monte Carlo control on the global ``env``.

    For each episode a full trajectory is collected by following ``policy``,
    then the trajectory is walked backwards, accumulating the discounted
    return ``G`` and moving each visited ``action_values[state][action]``
    towards ``G`` by a fraction ``alpha``.

    Args:
        policy: Callable invoked as ``policy(state, epsilon)`` that returns the
            action to take.
        action_values: Table indexed as ``action_values[state][action]``;
            updated in place.
        episodes: Number of episodes to run.
        gamma: Discount factor applied to the return at each backward step.
        epsilon: Forwarded unchanged to ``policy`` on every step.
        alpha: Step size of the incremental update.

    Returns:
        None. The result of training is the in-place change to ``action_values``.

    Notes:
        ``env`` is read from module scope. It is used as ``env.reset() -> state``
        and ``env.step(action) -> (next_state, reward, done, terminated)``.
    """
    # Exactly `episodes` episodes (the previous `range(episodes + 1)` ran one extra).
    for episode in range(episodes):
        trajectory = []
        state = env.reset()
        done, terminated = False, False
        # Parenthesised on purpose: `not done or terminated` parses as
        # `(not done) or terminated`, which keeps stepping after the episode
        # has ended. Stop as soon as either flag is raised.
        while not (done or terminated):
            action = policy(state, epsilon)
            next_state, reward, done, terminated = env.step(action)
            trajectory.append((state, action, reward))
            state = next_state

        print(episode)

        # Walk the episode backwards so G is the discounted return from each step.
        G = 0
        for state_t, action_t, reward_t in reversed(trajectory):
            G = reward_t + gamma * G
            qsa = action_values[state_t][action_t]
            action_values[state_t][action_t] += alpha * (G - qsa)
            

In [7]:
# Train with the epsilon-greedy `policy`; `action_values` is updated in place.
# The function prints the episode index every episode, so expect a long output.
on_policy_mc_cotrol(policy, action_values, episodes = 10000)

0
1
2
3
4
5
6
7
8
9
10
11
12
13


KeyboardInterrupt: 

In [None]:
# Dump the whole action-value table after training (one row per state).
print(action_values)

[[-2.50865675e+002 -3.94206396e+002 -1.26394075e+002 -3.34120662e+002]
 [-2.50442721e+002 -1.62737158e+002 -3.78139280e+002 -2.62752549e+002]
 [-9.44710362e+001 -1.73287831e+001 -8.35544787e+001 -1.44194744e+002]
 [-1.14649370e+002 -2.32470649e+002 -1.59334513e+001 -6.39802135e+001]
 [-2.54423631e+001 -2.05118946e+002 -9.92854032e+001 -5.28424876e+001]
 [-7.49247151e+001 -1.56102628e+001 -1.74909545e+002 -1.38829839e+002]
 [-1.00155207e+002 -1.14160002e+002 -7.45835386e+001 -1.52476433e+001]
 [-3.16885914e+001 -2.03228893e+001 -7.87273706e+001 -1.71729871e+002]
 [-2.45184230e+001 -5.64649500e+000 -2.10303527e+001 -2.13332022e+001]
 [-2.59573867e+001 -1.69729956e+001 -4.37054803e+000 -1.31759063e+001]
 [-9.27074171e+000 -4.57286900e+000 -2.28643256e+001 -7.59049970e+000]
 [-3.09170984e+001 -6.30028571e+000 -2.85479266e+000 -2.43745815e+001]
 [-2.70923235e+002 -2.32047631e+002 -1.82793098e+002 -2.23669452e+001]
 [-3.51885850e+002 -7.72389727e+001 -1.02602455e+003 -3.09639690e+002]
 [-1.9

# The test agent function

In [None]:
def test_agent(policy, episodes=1, epsilon=0.2):
    env.pygame_init()
    for episode in range(episodes):
        state = env.reset()
        done, terminated = False, False
        while not (done or terminated):
            action = policy(state, epsilon)
            next_state, reward, done, terminated = env.step(action)
            frame = env.render()
            state = next_state
        print(episode+1)

In [None]:
# Play one rendered episode. epsilon keeps test_agent's default of 0.2, so the
# agent still takes random actions some of the time during this run.
test_agent(policy, episodes=1)

1


In [None]:
# Release the environment (presumably tears down the render window -- confirm in env.py).
env.close()