CHATGPT PROMPT:
Can you create a Gym environment for a 6x6 grid with a start, a goal, and walls, with a GUI for rendering?

In [7]:
import numpy as np
import gym
from gym import spaces
import tkinter as tk
import time

class GridWorldEnv(gym.Env):
    """Small grid world with a fixed start, a fixed goal and a Tkinter view.

    States are ``(row, col)`` tuples, with ``(0, 0)`` drawn in the top-left
    corner. Actions are ``0`` = up, ``1`` = down, ``2`` = left, ``3`` = right;
    a move that would leave the grid is clamped to the border. Every step
    yields a reward of ``-1`` except the step that lands on the goal, which
    yields ``1`` and ends the episode.

    NOTE: ``reset`` and ``step`` return the ``(row, col)`` tuple rather than
    an integer inside ``observation_space``; callers that index a table by
    state must flatten it themselves (``row * n_cols + col``).

    NOTE: ``walls`` is stored but is not consulted by ``step``,
    ``_get_reward`` or ``render``, and several of its entries lie outside the
    current 4x4 grid. It is kept only so existing readers of the attribute
    keep working.
    """

    metadata = {'render.modes': ['human']}

    # Pixel size of the drawing canvas; cells are sized by integer division.
    CANVAS_WIDTH = 300
    CANVAS_HEIGHT = 300

    def __init__(self):
        super().__init__()
        self.shape = (4,4)
        self.start_state = (0, 0)
        self.goal_state = (3,3)
        self.walls = [(2, 2), (3, 2), (4, 2), (1, 4), (2, 4), (3, 4)]
        self.current_state = self.start_state
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Discrete(self.shape[0] * self.shape[1])

    def reset(self):
        """Move the agent back to the start cell and return it as (row, col)."""
        self.current_state = self.start_state
        return self.current_state

    def step(self, action):
        """Apply one move and return ``(state, reward, done, info)``.

        ``action`` is 0 (up), 1 (down), 2 (left) or 3 (right). Any other
        value leaves the agent where it is, but the step is still rewarded
        like any other.
        """
        row, col = self.current_state
        if action == 0:  # up
            row = max(row - 1, 0)
        elif action == 1:  # down
            row = min(row + 1, self.shape[0] - 1)
        elif action == 2:  # left
            col = max(col - 1, 0)
        elif action == 3:  # right
            col = min(col + 1, self.shape[1] - 1)
        self.current_state = (row, col)
        reward = self._get_reward()
        done = self._is_done()
        info = {}
        return self.current_state, reward, done, info

    def _get_reward(self):
        """Return 1 when the agent is on the goal cell, otherwise -1."""
        return 1 if self.current_state == self.goal_state else -1

    def _is_done(self):
        """Return True once the agent stands on the goal cell."""
        return self.current_state == self.goal_state

    def render(self, mode='human'):
        """Draw the grid in a Tk window (created lazily on the first call).

        The agent's cell is green, the goal is blue and all other cells are
        white. Modes other than ``'human'`` are ignored.
        """
        if mode == 'human':
            if not hasattr(self, 'root'):
                self.root = tk.Tk()
                self.canvas = tk.Canvas(
                    self.root,
                    width=self.CANVAS_WIDTH,
                    height=self.CANVAS_HEIGHT,
                )
                self.canvas.pack()
                self.root.update()
            self.canvas.delete('all')
            cell_width = self.CANVAS_WIDTH // self.shape[1]
            cell_height = self.CANVAS_HEIGHT // self.shape[0]
            for i in range(self.shape[0]):
                for j in range(self.shape[1]):
                    x1 = j * cell_width
                    y1 = i * cell_height
                    x2 = (j + 1) * cell_width
                    y2 = (i + 1) * cell_height
                    # The agent colour wins when the agent is on the goal.
                    fill_color = 'white'
                    if (i, j) == self.current_state:
                        fill_color = 'green'
                    elif (i, j) == self.goal_state:
                        fill_color = 'blue'
                    self.canvas.create_rectangle(x1, y1, x2, y2, fill=fill_color)
            # Flush pending draw events so the frame appears without mainloop().
            self.root.update()

    def close(self):
        """Destroy the Tk window, if one was opened by ``render``.

        Safe to call repeatedly; a later ``render`` call opens a new window.
        """
        if hasattr(self, 'root'):
            try:
                self.root.destroy()
            except tk.TclError:
                # The user already closed the window; nothing left to free.
                pass
            del self.root
            del self.canvas


In [3]:
# Roll out a single episode under a uniformly random policy, redrawing the
# grid after every move, until the environment reports the goal was reached.
env = GridWorldEnv()
obs, done = env.reset(), False
while True:
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        break

CHATGPT PROMPT: What RL models can be applied in this environment?

Response: The grid world environment described above can be used to train and test a variety of reinforcement learning algorithms, including but not limited to:

Q-learning: A model-free algorithm that learns the optimal action-value function by iteratively updating the Q-values based on the rewards obtained and the estimated future rewards.

SARSA (State-Action-Reward-State-Action): Another model-free algorithm that learns the optimal policy by estimating the expected rewards for taking an action in a given state and following a policy thereafter.
Policy gradient methods: These are model-free algorithms that optimize the policy directly to maximize the expected cumulative reward.

Actor-critic methods: These are typically model-free algorithms that combine value-based and policy-based methods: a critic estimates a value function, and an actor uses that estimate to improve the policy.

Deep reinforcement learning algorithms: These are extensions of the above algorithms that incorporate deep neural networks to learn the Q-values, policies, or both.

The choice of the algorithm depends on the specific requirements of the problem, such as the complexity of the environment, the dimensionality of the state and action spaces, the availability of a simulator or model, and the desired performance metrics.

CHATGPT PROMPT: Can you write a SARSA program for the 6x6 grid environment you created?

In [8]:
import numpy as np
import gym

env = GridWorldEnv()  # create the environment

# SARSA parameters
alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.1  # probability of taking a random (exploratory) action
num_episodes = 70

# Grid width used to flatten a (row, col) state into a row-major table index.
# Read from the environment so the script keeps working if the shape changes.
n_cols = env.shape[1]

# Q table initialization: one row per grid cell, one column per action
q_table = np.zeros((env.observation_space.n, env.action_space.n))

# SARSA algorithm
for episode in range(num_episodes):
    state = env.reset()
    action = env.action_space.sample()  # first action of the episode is random
    done = False
    while not done:
        next_state, reward, done, info = env.step(action)
        s = state[0] * n_cols + state[1]
        s1 = next_state[0] * n_cols + next_state[1]
        # Epsilon-greedy choice of the action that will actually be taken next.
        if np.random.uniform(0, 1) < epsilon:
            next_action = env.action_space.sample()
        else:
            next_action = np.argmax(q_table[s1, :])
        # On-policy TD target: bootstrap from the action chosen above. The goal
        # state's row is never updated, so it contributes 0 at episode end.
        td_target = reward + gamma * q_table[s1, next_action]
        q_table[s, action] += alpha * (td_target - q_table[s, action])
        env.render()
        state, action = next_state, next_action

# Evaluate the learned policy



In [9]:
# Show the learned Q-table: one row per flattened state index
# (row * 4 + col), one column per action.
print(q_table)

[[-2.70929795 -2.5765467  -2.78906422 -2.55943119]
 [-1.99599665 -2.00345606 -2.02746569 -2.01152485]
 [-1.53772699 -1.46437193 -1.53446727 -1.50589527]
 [-1.16492607 -1.01716845 -1.1147356  -1.04361337]
 [-2.00763724 -2.03227034 -2.14450133 -2.03322799]
 [-1.62850636 -1.60289093 -1.67199946 -1.60424485]
 [-1.09042087 -1.04199204 -1.03947216 -1.04158663]
 [-0.52442392 -0.18844946 -0.45069555 -0.69074998]
 [-1.59915403 -1.49674577 -1.60390818 -1.53647656]
 [-1.26095098 -1.04563157 -1.1877521  -1.02248816]
 [-0.46624681 -0.38820351 -0.44543984 -0.19947997]
 [-0.1         0.98922474 -0.12675768  0.        ]
 [-1.1182417  -1.01658033 -1.1711956  -1.04246665]
 [-0.51606753 -0.64145337 -0.52991512 -0.27928901]
 [-0.21559265 -0.19767718 -0.199       0.94185026]
 [ 0.          0.          0.          0.        ]]


: 