In [1]:
import random
import math
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable

In [2]:
learning_rate = .05 # alpha, 0.0005
mini_batch_size = 500 # number of observations used per step to update the model.
discount_factor = 0.1 # gamma - closer to one the more consideration of future rewards
replay_buffer_size = 2000 # For experience Replay, number of observations in AI memory, 100,000
interpolation_parameter = 1e-3 # 0.001

In [3]:
env = gym.make('FrozenLake-v1', desc=["SFFH","FFFH","HFFF","HFFG"], map_name="4x4", is_slippery=False, render_mode=None)

state_size = env.observation_space.n
action_size = env.action_space.n

In [4]:
class Network(nn.Module):
    def __init__(self, state_size, action_size, seed=42):
        super(Network, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.full_connected_layer_1 = nn.Linear(2,4)
        self.full_connected_layer_2 = nn.Linear(4,4)
        self.full_connected_layer_3 = nn.Linear(4,action_size)

    def forward(self,state,board):
        X = self.full_connected_layer_1(state)
        X = F.relu(X)
        X = self.full_connected_layer_2(X)
        X = F.relu(X)
        return self.full_connected_layer_3(X)

In [5]:
class ReplayMemory(object):
    def __init__(self, capacity):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.capacity = capacity
        self.memory = []

    def push(self,event):
        self.memory.append(event)
        if len(self.memory) > self.capacity:
            del self.memory[0]

    def sample(self, batch_size):
        experiences = random.sample(self.memory, k=batch_size)
        return experiences

In [6]:
class Agent():
    def __init__(self,state_size,action_size):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.action_size = action_size
        self.local_qnetwork = Network(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr = learning_rate)
        self.memory = ReplayMemory(replay_buffer_size)
        self.board = torch.zeros((state_size,action_size)).to(self.device)
        self.t_step = 0
        self.l_step = 0

    def learn(self, state, experiences, discount_factor):
        self.l_step = (self.l_step + 1) % 10
        if self.l_step % 10 == 0:
            for idx, exp in enumerate(experiences):
                self.board[exp[0],exp[1]] = self.board[exp[0],exp[1]] + learning_rate * ( 
                                exp[2] + discount_factor * torch.max(self.board[exp[3]]) - self.board[exp[0],exp[1]] 
                            )
        state = torch.tensor(state).float().unsqueeze(0).to(self.device)
        predicted_action = torch.argmax(self.local_qnetwork(state))
        predicted_best = Variable(self.board[int(state.item()), predicted_action.item()], requires_grad=True)
        actual_best = Variable(torch.max(self.board[int(state.item())]), requires_grad=True)

        loss = F.mse_loss(actual_best*10,predicted_best*10)
        self.optimizer.zero_grad()
        loss.backward()
        # print(f"Predicted: {predicted_best.item()}\tActual: {actual_best.item()}\t Loss: {loss.item()}")
        self.optimizer.step()


    def step(self, state, action, reward, next_state):
        self.memory.push((state, action, reward, next_state))
        self.t_step  = (self.t_step + 1) % 4
        if self.t_step == 0 and len(self.memory.memory) >= mini_batch_size:
            experiences = self.memory.sample(mini_batch_size)
            self.learn(state, experiences, discount_factor)

    def act(self, state, epsilon = 0):
        state = torch.tensor(state).float().unsqueeze(0).to(self.device)
        hard_action = torch.argmax(self.board[int(state.item())])
        self.local_qnetwork.eval()
        with torch.no_grad():
            action_values = self.local_qnetwork(state)
        self.local_qnetwork.train()
        # print(f'Actual {hard_action}\nfrom ANN: {torch.argmax(action_values)}')
        return hard_action.item()


In [7]:
agent = Agent(state_size, action_size)

In [8]:
number_episodes = 5000
allocated_setps = 30
epsilon_starting_value  = 1.0
epsilon_ending_value  = 0
epsilon_decay_value  = 5e-4
epsilon = 1
w_l = []

for episode in range(1, number_episodes + 1):
    state = env.reset()[0]
    steps_taken = 0
    total_rewards = 0
    for t in range(allocated_setps):
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = agent.act(state, epsilon)
            # print(f'Action Received: {action}')
        
        next_state, reward, terminated, _, _ = env.step(action)
        
        if terminated and next_state != 15:
            reward = -2
        elif not terminated:
            sx = (next_state % 4) + 1
            sy = math.floor(next_state/4)+1
            gx = (16 % 4) + 1
            gy = math.floor(16/4)+1
            reward = ((sx+gx)/2+(sy+gy)/2)*.001
        agent.step(state, action, reward, next_state)
        state = next_state
        steps_taken += 1
        total_rewards += reward
        if terminated:
            break
    epsilon = max(epsilon_ending_value, epsilon - epsilon_decay_value)
    if state == 15 and terminated:
        print(f"WIN / State: {state} / Reward: {reward}")
        if episode-1 >= number_episodes-number_episodes*.1:
            w_l.append(1)
    if state != 15 and terminated:
        if episode-1 >= number_episodes-number_episodes*.1:
            w_l.append(0)
    print(f'Episode {episode}\tSteps Taken {steps_taken}\tEpsilon: {epsilon}\tRewards: {total_rewards}')
env.close()
print(f'Win Lose Ratio for last 10% of Episodes is {(sum(w_l)/len(w_l))*100}%')
print(agent.board)

Episode 1	Steps Taken 2	Epsilon: 0.9995	Rewards: -1.9955
Episode 2	Steps Taken 12	Epsilon: 0.9990000000000001	Rewards: -1.949
Episode 3	Steps Taken 5	Epsilon: 0.9985000000000002	Rewards: -1.983
Episode 4	Steps Taken 2	Epsilon: 0.9980000000000002	Rewards: -1.9955
Episode 5	Steps Taken 7	Epsilon: 0.9975000000000003	Rewards: -1.971
Episode 6	Steps Taken 6	Epsilon: 0.9970000000000003	Rewards: -1.9775
Episode 7	Steps Taken 11	Epsilon: 0.9965000000000004	Rewards: -1.9585
Episode 8	Steps Taken 8	Epsilon: 0.9960000000000004	Rewards: -1.971
WIN / State: 15 / Reward: 1.0
Episode 9	Steps Taken 25	Epsilon: 0.9955000000000005	Rewards: 1.122
Episode 10	Steps Taken 2	Epsilon: 0.9950000000000006	Rewards: -1.9955
Episode 11	Steps Taken 6	Epsilon: 0.9945000000000006	Rewards: -1.9765
Episode 12	Steps Taken 2	Epsilon: 0.9940000000000007	Rewards: -1.9955
Episode 13	Steps Taken 4	Epsilon: 0.9935000000000007	Rewards: -1.986
Episode 14	Steps Taken 10	Epsilon: 0.9930000000000008	Rewards: -1.9595
Episode 15	Ste