In [1]:
import gymnasium as gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
import random
import torch
from torch import nn
import torch.nn.functional as F
import wandb

class DeepQLearningNetwork(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Args:
        input_state: Size of the input vector.
        hidden_layer: Number of units in the single hidden layer.
        output_state: Size of the output vector (one value per action when
            used as a Q-network).
    """

    def __init__(self, input_state, hidden_layer, output_state):
        super().__init__()
        # The attribute names are part of the state-dict keys; keep them.
        self.first_layer = nn.Linear(in_features=input_state, out_features=hidden_layer)
        self.last_layer = nn.Linear(in_features=hidden_layer, out_features=output_state)

    def forward(self, x):
        """Return the raw (un-activated) outputs of the last layer for ``x``."""
        hidden_activation = F.relu(self.first_layer(x))
        return self.last_layer(hidden_activation)

class ReplayMemory():
    """Bounded FIFO buffer of transitions with hindsight relabeling.

    Transitions are stored as given; ``HER`` expects each one to be a
    6-tuple ``(state, action, next_state, reward, done, goal)``. Once
    ``maxlen`` transitions are held, appending drops the oldest one.
    """

    def __init__(self, maxlen):
        self.memory = deque([], maxlen=maxlen)

    def append(self, transition):
        """Store one transition, evicting the oldest if the buffer is full."""
        self.memory.append(transition)

    def sample(self, sample_size):
        """Return ``sample_size`` distinct stored transitions picked at random.

        Raises:
            ValueError: If ``sample_size`` is larger than the number of
                stored transitions (raised by ``random.sample``).
        """
        return random.sample(self.memory, sample_size)

    def HER(self, sample_size):
        """Sample transitions and add hindsight-relabeled copies of them.

        Hindsight Experience Replay creates extra experiences by replacing
        the original goal of a transition with the goal that was actually
        achieved, which gives the learner more varied and more informative
        experiences.

        Each sampled transition is copied with ``goal`` replaced by its own
        ``next_state``. Because the new goal is by construction the state
        that was reached, the relabeled reward is always ``1.0``. The
        ``done`` flag is carried over unchanged.

        Args:
            sample_size: Number of transitions to sample.

        Returns:
            list: The ``sample_size`` sampled transitions followed by their
            ``sample_size`` relabeled copies, in the same order.
        """
        experiences = self.sample(sample_size)

        # NOTE(review): the relabeled copies keep the original ``done`` flag
        # even though their goal has been reached -- confirm this is intended.
        relabeled = [
            (state, action, next_state, 1.0, done, next_state)
            for state, action, next_state, _reward, done, _goal in experiences
        ]

        # Original experiences first, then the hindsight ones.
        return experiences + relabeled

    def __len__(self):
        return len(self.memory)

class DeepQLearning():
    """Goal-conditioned deep Q-learning agent for 8x8 FrozenLake.

    The Q-network input is a one-hot state concatenated with a one-hot goal
    (see ``state_goal_tensor``). Training mini-batches are drawn through
    ``ReplayMemory.HER``, which appends hindsight-relabeled transitions.
    """

    BATCH_SIZE = 32      # transitions sampled per optimization step (HER then doubles the batch)
    BUFFER_SIZE = 1000   # capacity of the replay memory
    LR = 0.001           # learning rate of the SGD optimizer
    DISCOUNT = 0.9       # discount factor used in the bootstrapped target
    SYNC_STEPS = 10      # env-step threshold above which the target network is re-synced
    loss_function = nn.MSELoss()
    optimizer = None     # created in train()
    actions = ["Left", "Down", "Right", "Up"]

    # NOTE: this runs once, when the class body is executed (i.e. when the
    # class is defined), not per instance.
    wandb.init(project="frozen_lake_project_HER", config={"BATCH_SIZE": BATCH_SIZE, "LR": LR, "DISCOUNT": DISCOUNT, "SYNC_STEPS": SYNC_STEPS})

    def train(self, episodes, is_slippery=False):
        """Train the policy network and save its weights to ``best_model.pt``.

        Runs ``episodes`` episodes with an epsilon-greedy policy and stores
        every transition in a replay memory. After each episode, once the
        memory holds more than ``BATCH_SIZE`` transitions, one optimization
        step is performed on a HER-augmented mini-batch, epsilon is decreased
        by ``1 / episodes`` (floored at 0), and the target network is synced
        if more than ``SYNC_STEPS`` environment steps have accumulated since
        the last sync.

        Side effects: sets the module-level array ``returns_per_episode``,
        logs episode step counts to wandb, assigns ``self.optimizer``.

        Args:
            episodes: Number of training episodes.
            is_slippery: Forwarded to ``gym.make``.
        """
        global returns_per_episode

        # NOTE(review): "human" rendering is requested during training while
        # test() uses "rgb_array" -- presumably this slows training; confirm
        # the modes are not swapped.
        env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=is_slippery, render_mode="human")
        try:
            number_of_states = env.observation_space.n
            number_of_actions = env.action_space.n

            EPS = 1
            memory = ReplayMemory(self.BUFFER_SIZE)

            # Input is twice the state count: one-hot state + one-hot goal.
            policy_dqn = DeepQLearningNetwork(number_of_states * 2, number_of_states, number_of_actions)
            target_dqn = DeepQLearningNetwork(number_of_states * 2, number_of_states, number_of_actions)

            # Start both networks from identical weights.
            target_dqn.load_state_dict(policy_dqn.state_dict())

            self.optimizer = torch.optim.SGD(policy_dqn.parameters(), lr=self.LR)

            returns_per_episode = np.zeros(episodes)

            step_count = 0

            for i in range(episodes):
                state = env.reset()[0]
                # NOTE(review): the goal fed to the network is drawn at
                # random, but the reward stored below is the one returned by
                # env.step, which does not depend on this goal -- confirm
                # this is the intended training signal.
                goal = np.random.randint(0, number_of_states)
                terminated = False
                truncated = False
                episode_return = 0
                episode_steps = 0

                while not terminated and not truncated:

                    # Epsilon-greedy action selection.
                    if random.random() < EPS:
                        action = env.action_space.sample()
                    else:
                        with torch.no_grad():
                            state_goal = self.state_goal_tensor(state, goal, number_of_states)
                            action = policy_dqn(state_goal).argmax().item()

                    new_state, reward, terminated, truncated, _ = env.step(action)
                    memory.append((state, action, new_state, reward, terminated, goal))
                    state = new_state
                    episode_return += reward
                    step_count += 1
                    episode_steps += 1

                returns_per_episode[i] = episode_return

                wandb.log({"Episode": i + 1, "Steps": episode_steps})

                if len(memory) > self.BATCH_SIZE:
                    mini_batch = memory.HER(self.BATCH_SIZE)
                    self.optimize(mini_batch, policy_dqn, target_dqn)

                    # Linear epsilon decay, one step per optimized episode.
                    EPS = max(EPS - 1 / episodes, 0)

                    if step_count > self.SYNC_STEPS:
                        target_dqn.load_state_dict(policy_dqn.state_dict())
                        step_count = 0
        finally:
            # Release the environment even if training raised.
            env.close()

        torch.save(policy_dqn.state_dict(), "best_model.pt")

    def optimize(self, mini_batch, policy_dqn, target_dqn):
        """Run one gradient step of the policy network on ``mini_batch``.

        For every transition the regression target is the target network's
        Q-vector for ``(state, goal)`` with the entry of the taken action
        replaced by ``reward`` (terminal transition) or
        ``reward + DISCOUNT * max_a Q_target(new_state, goal)`` otherwise.
        The MSE between the policy network's Q-vectors and these targets is
        minimised with ``self.optimizer``.

        Side effects: sets the module-level ``accuracy`` to
        ``(1 - loss) * 100`` (a detached tensor) and logs it to wandb.

        Args:
            mini_batch: Iterable of
                ``(state, action, new_state, reward, terminated, goal)``.
            policy_dqn: Network being trained.
            target_dqn: Network used to build the targets; its parameters
                are not updated here.
        """
        global accuracy

        # The first layer takes state one-hot + goal one-hot.
        number_of_states = policy_dqn.first_layer.in_features // 2

        current_q_list = []
        target_q_list = []

        for state, action, new_state, reward, terminated, goal in mini_batch:
            state_input = self.state_goal_tensor(state, goal, number_of_states)

            # Targets are constants for the regression: build them without
            # autograd so backward() does not accumulate gradients in the
            # target network.
            with torch.no_grad():
                if terminated:
                    target = float(reward)
                else:
                    next_q = target_dqn(self.state_goal_tensor(new_state, goal, number_of_states))
                    target = float(reward) + self.DISCOUNT * next_q.max()

                target_q = target_dqn(state_input)
                target_q[action] = target

            current_q_list.append(policy_dqn(state_input))
            target_q_list.append(target_q)

        loss = self.loss_function(torch.stack(current_q_list), torch.stack(target_q_list))

        # Detached so the global does not keep the autograd graph alive.
        accuracy = (1 - loss.detach()) * 100

        wandb.log({"Accuracy": accuracy})

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def state_goal_tensor(self, state:int, goal:int, number_of_states:int):
        """Return the network input for ``(state, goal)``.

        The result is a 1-D tensor of length ``2 * number_of_states`` that is
        zero everywhere except at index ``state`` and at index
        ``number_of_states + goal``, which are set to 1.
        """
        input_tensor = torch.zeros(number_of_states * 2)
        input_tensor[state] = 1
        input_tensor[number_of_states + goal] = 1
        return input_tensor

    def test(self, episodes, is_slippery=False):
        """Run the saved policy greedily for ``episodes`` episodes.

        Loads the weights from ``best_model.pt`` and, in every episode,
        follows the arg-max action for a randomly drawn goal until the
        environment reports termination or truncation. Nothing is returned
        or recorded.

        Args:
            episodes: Number of evaluation episodes.
            is_slippery: Forwarded to ``gym.make``.
        """
        env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=is_slippery, render_mode="rgb_array")
        try:
            number_of_states = env.observation_space.n
            number_of_actions = env.action_space.n

            policy_dqn = DeepQLearningNetwork(number_of_states * 2, number_of_states, number_of_actions)
            policy_dqn.load_state_dict(torch.load("best_model.pt"))
            policy_dqn.eval()

            for _ in range(episodes):
                state = env.reset()[0]
                goal = np.random.randint(0, number_of_states)
                terminated = False
                truncated = False

                while not terminated and not truncated:
                    with torch.no_grad():
                        action = policy_dqn(self.state_goal_tensor(state, goal, number_of_states)).argmax().item()

                    state, _reward, terminated, truncated, _ = env.step(action)
        finally:
            # Release the environment even if evaluation raised.
            env.close()

# Run configuration for this notebook cell.
slippery = True
train_episode_count = 500
test_episode_count = 50

# Train the agent, then replay the saved policy.
model = DeepQLearning()
model.train(episodes=train_episode_count, is_slippery=slippery)
model.test(episodes=test_episode_count, is_slippery=slippery)

  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


Episode 1: 21 steps
Episode 2: 17 steps
Accuracy: %83.94
--------------------------
Episode 3: 42 steps
Accuracy: %84.50
--------------------------
Episode 4: 58 steps
Accuracy: %84.50
--------------------------
Episode 5: 29 steps
Accuracy: %84.72
--------------------------
Episode 6: 100 steps
Accuracy: %85.30
--------------------------
Episode 7: 33 steps
Accuracy: %84.47
--------------------------
Episode 8: 51 steps
Accuracy: %84.80
--------------------------
Episode 9: 16 steps
Accuracy: %85.48
--------------------------
Episode 10: 20 steps
Accuracy: %85.34
--------------------------
Episode 11: 19 steps
Accuracy: %84.55
--------------------------
Episode 12: 24 steps
Accuracy: %84.14
--------------------------
Episode 13: 13 steps
Accuracy: %84.81
--------------------------
Episode 14: 7 steps
Accuracy: %85.11
--------------------------
Episode 15: 13 steps
Accuracy: %84.52
--------------------------
Episode 16: 67 steps
Accuracy: %85.49
--------------------------
Episode 17: 3