In [1]:
import random
import math
import numpy as np
import gymnasium as gym
import torch

In [8]:
learning_rate = .9
mini_batch_size = 100 
discount_factor = 0.98 
replay_buffer_size = 200

episode = 25
allocated_setps = 30

game_board = ["SFFH","FFFH","HFFF","HFFG"]
goal = 16
show_board = True

env = gym.make('FrozenLake-v1', desc=game_board, map_name="4x4", is_slippery=False, render_mode='human' if show_board else None)

observation_size = env.observation_space.n
action_size = env.action_space.n

In [3]:
epsilon_ending_value  = 0.01
epsilon_decay_value  = 2 / episode
epsilon = 1

In [4]:
class ReplayMemory(object):
    def __init__(self, capacity):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.capacity = capacity
        self.memory = []

    def push(self,event):
        self.memory.append(event)
        if len(self.memory) > self.capacity:
            del self.memory[0]

    def sample(self, batch_size):
        experiences = random.sample(self.memory, k=batch_size)
        return experiences

In [5]:
class Agent():
    def __init__(self,observation_size,action_size):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.action_size = action_size
        self.memory = ReplayMemory(replay_buffer_size)
        self.board = torch.zeros((observation_size,action_size)).to(self.device)
        self.t_step = 0

    def learn(self, state, experiences, discount_factor):
        for idx, exp in enumerate(experiences):
            self.board[exp[0],exp[1]] = self.board[exp[0],exp[1]] + learning_rate * ( 
                            exp[2] + discount_factor * torch.max(self.board[exp[3]]) - self.board[exp[0],exp[1]] 
                        )

    def step(self, state, action, reward, next_state):
        self.memory.push((state, action, reward, next_state))
        self.t_step  = (self.t_step + 1) % 4
        if self.t_step == 0 and len(self.memory.memory) >= mini_batch_size:
            experiences = self.memory.sample(mini_batch_size)
            self.learn(state, experiences, discount_factor)

    def act(self, state, epsilon = 0):
        best_action = torch.argmax(self.board[state])
        if random.random() < epsilon: 
            return random.choice(np.arange(self.action_size))
        return best_action.item()


In [6]:
agent = Agent(observation_size, action_size)

In [9]:
w_l = []

for episode in range(1, episode + 1):
    steps_taken = 0
    terminated = False

    state = env.reset()[0]
    while(steps_taken<=allocated_setps and not terminated):
        action = agent.act(state, epsilon)

        next_state, reward, terminated, _, _ = env.step(action)

        if terminated and next_state != 15:
            reward = -2
        elif not terminated:
            sx = (next_state % action_size) + 1
            sy = math.floor(next_state/action_size)+1
            gx = (goal % action_size) + 1
            gy = math.floor(goal/action_size)+1
            reward = ((sx+gx)/2+(sy+gy)/2)*.001
            
        agent.step(state, action, reward, next_state)

        state = next_state
        steps_taken += 1

    epsilon = max(epsilon_ending_value, epsilon - epsilon_decay_value)

    if state == 15 and terminated:
        print(f"WIN / State: {state} / Reward: {reward}")
        if episode-1 >= episode-episode*.05:
            w_l.append(1)

    if state != 15 and terminated:
        if episode-1 >= episode-episode*.05:
            w_l.append(0)

    print(f'Episode {episode}\tSteps Taken {steps_taken}\tEpsilon: {epsilon}')
env.close()
print(f'Win Lose Ratio for last 10% of Episodes is {(sum(w_l)/len(w_l))*100}%')
print(agent.board)

WIN / State: 15 / Reward: 1.0
Episode 1	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 2	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 3	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 4	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 5	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 6	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 7	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 8	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 9	Steps Taken 8	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 10	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 11	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 12	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 13	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 14	Steps Taken 6	Epsilon: 0.01
WIN / State: 15 / Reward: 1.0
Episode 15	St