In [1]:
import sys
import numpy as np
import math
import random
import json
import requests

import gym
import gym_maze
from gym_maze.envs.maze_manager import MazeManager
from riddle_solvers import *

import torch
import pygame
import time
from collections import deque
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_name = torch.cuda.get_device_name(torch.cuda.current_device())
device_name

'NVIDIA GeForce RTX 3050 Laptop GPU'

In [3]:
def flat_obv(obv):
    flattened = []
    for dim in obv:
        for arr in dim:
            if isinstance(arr, list):
                for item in arr:
                    flattened.append(item)
            else:
                flattened.append(arr)
    return flattened

In [4]:
class policy(nn.Module):
    def __init__(self, s_size, a_size, h_size1,h_size2) -> None:
        super().__init__() # calling the super class
        
        #Network layers 
        self.L1 = nn.Linear(s_size, h_size1)
        self.L2 = nn.Linear(h_size1, h_size2)
        self.L3 = nn.Linear(h_size2, a_size)
    
    def forward(self, x):
        # the forward pass of the network
        x = F.relu(self.L1(x))
        x = F.relu(self.L2(x))
        x = self.L3(x)
        x = F.softmax(x,dim=1)
        return x
    
    
    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device) # setting the state to tensor with the input shape
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)
    

In [5]:
s_size = 14
a_size = 4

In [6]:
maze_hyper = {
    "h_size1": 16,
    "h_size2": 32,
    "h_size3" : 16,
    "n_training_episodes": 1000,
    "n_evaluation_episodes": 10,
    "max_t": 5000,
    "gamma": 1.0,
    "lr": 1e-2,
    "state_space": s_size,
    "action_space": a_size,
}

In [7]:
maze_policy = policy(maze_hyper["state_space"], maze_hyper["action_space"], maze_hyper["h_size2"], maze_hyper["h_size3"]).to(device)

In [17]:
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
    
    actions = ['N', 'S', 'E', 'W']
    sample_maze = np.load("hackathon_sample.npy")
    agent_id = "9" # add your agent id here
    
    manager = MazeManager()
    manager.init_maze(agent_id, maze_cells = sample_maze)
    env = manager.maze_map[agent_id]

    
    for i_episode in range(1, n_training_episodes+1):
        state = manager.reset(agent_id)
        num_of_actions = 0
        for t in range(max_t):
            state = flat_obv(state)
            num_of_actions+=1
            state = np.array(state)
            action, log_prob = policy.act(state)
            action = actions[action]
            state, reward, done, truncated, info = manager.step(agent_id, action)
            
            if done:
                break
        state = flat_obv(state)
        print(state)
        num_of_reached_riddles  = 0
        
        for i in range(2,6):
            if state[i] == -1:
                num_of_reached_riddles += 1
        
        score = (num_of_reached_riddles * 1000) / num_of_actions
        
        if state[0] != 9 or state[1] != 9:
            score *= .8
        
        policy_loss = -score

        
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        
        if i_episode / print_every == 0 :
            print(score)

In [18]:
maze_optimizer = optim.Adam(maze_policy.parameters(), lr=maze_hyper["lr"])

In [19]:
reinforce(          maze_policy,
                    maze_optimizer,
                    maze_hyper["n_training_episodes"], 
                    maze_hyper["max_t"],
                    maze_hyper["gamma"], 
                    100)

[6, 0, 9, 9, 7, 12, 1, 1, -1, 1, -1, 1, -1, 1]


AttributeError: 'float' object has no attribute 'backward'

: 