In [15]:
import sys
import numpy as np
import math
import random
import json
import requests

import gym
import gym_maze
from gym_maze.envs.maze_manager import MazeManager
from riddle_solvers import *

import torch
import pygame
import time
from collections import deque
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import warnings

warnings.filterwarnings('ignore')

In [16]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_name = torch.cuda.get_device_name(torch.cuda.current_device())
device_name

'NVIDIA GeForce RTX 3050 Laptop GPU'

In [17]:
def flat_obv(obv):
    flattened = []
    for dim in obv:
        if isinstance(dim, list):
            for arr in dim:
                if isinstance(arr, list):
                    for item in arr:
                        flattened.append(item)
                else:
                    flattened.append(arr)
        else:
            flattened.append(dim)
    return flattened

In [18]:
class ActorCritic(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16,output_dim),
            nn.Softmax(dim=-1)
        )
        self.critic = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, state):
        policy = self.actor(state)
        value = self.critic(state)
        return policy, value


In [19]:
s_size = 14
a_size = 4

In [20]:
maze_hyper = {
    "h_size1": 16,
    "h_size2": 32,
    "h_size3" : 16,
    "n_training_episodes": 1000,
    "n_evaluation_episodes": 10,
    "max_t": 5000,
    "gamma": 1.0,
    "lr": 1e-2,
    "state_space": s_size,
    "action_space": a_size,
}

In [21]:
def convert_state(state):
    state0 = state[0]
    state1 = state[1]
    state2 = state[2]
    
    ans = []
    
    for i in state0:
        ans.append(i)
    
    for i in state1:
        ans.append(i)
    
    for i in state2:
        for j in i:
            ans.append(j)
    
    return ans

In [22]:
sample_maze = np.load("hackathon_sample.npy")
agent_id = "9" # add your agent id here
    
manager = MazeManager()
manager.init_maze(agent_id, maze_cells = sample_maze)
env = manager.maze_map[agent_id]

In [23]:
input_dim = s_size
output_dim = a_size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ActorCritic(input_dim, output_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=.002)

In [24]:
actionss = ['N', 'S', 'E', 'W']
def a2c(env, num_episodes, gamma, lr):
    for i in range(num_episodes):
        moves_count = {'W': 0, 'S' : 0, 'N': 0, 'E' :0}
        obs = manager.reset(agent_id)
        done = False
        rewards = []
        states = []
        actions = []
        rid_recused = []
        obs = convert_state(obs)
        obs = np.array(obs)
        steps_count = 0
        while not done and steps_count < 5001:
            manager.render(agent_id)
            time.sleep(.1)
            steps_count+= 1
            states.append(obs)
            policy, value = model(torch.tensor(obs).float().to(device))
            action_probs = policy.detach().cpu().numpy()
            action = np.random.choice(output_dim, p=action_probs)
            pick_action = actionss[action]
            moves_count[pick_action]+=1
            obs, reward, done, _, info = manager.step(agent_id,pick_action)
            obs = convert_state(obs)
            obs = np.array(obs)
            actions.append(action)
            reward = 0
            if info['riddle_type'] not in rid_recused:
                rid_recused.append(info['riddle_type'])
                reward = 1000
            rewards.append(reward)
        reward_sum = sum(rewards)
        print(f"\nEpisode num : {i}, reward is : {reward_sum}, and moves count is {moves_count}")
        R = 0
        returns = []
        for r in rewards[::-1]:
            R = r + gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns).float().to(device)
        states = torch.tensor(states).float().to(device)
        actions = torch.tensor(actions).long().to(device)
        policy, value = model(states)
        advantage = returns - value.detach().squeeze()
        critic_loss = advantage.pow(2).mean()
        actor_loss = -torch.log(policy.gather(1, actions.unsqueeze(1)).squeeze() + 1e-10) * advantage.detach()
        actor_loss = actor_loss.mean()
        loss = actor_loss + critic_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [25]:
a2c(env, 100, 256, 1, .001)


Episode num : 0, reward is : 5000, and moves count is {'W': 1093, 'S': 1288, 'N': 1434, 'E': 1186}


KeyboardInterrupt: 

In [29]:
actions = ['N', 'S', 'E', 'W']

sample_maze = np.load("hackathon_sample.npy")
agent_id = "9" # add your agent id here
    
manager = MazeManager()
manager.init_maze(agent_id, maze_cells = sample_maze)
env = manager.maze_map[agent_id]

riddle_solvers = {'cipher': cipher_solver, 'captcha': captcha_solver, 'pcap': pcap_solver, 'server': server_solver}
maze = {}
states = {}

    
maze['maze'] = env.maze_view.maze.maze_cells.tolist()
maze['rescue_items'] = list(manager.rescue_items_dict.keys())

MAX_T = 5000
RENDER_MAZE = True
    

with open("./states.json", "w") as file:
    json.dump(states, file)

    
with open("./maze.json", "w") as file:
    json.dump(maze, file)
    
state = manager.reset(agent_id)
moves_count = {'W': 0, 'S' : 0, 'N': 0, 'E' :0}

for i in range(5000):
    state = convert_state(state)
    state = np.array(state)
    state = torch.tensor(state).float().to(device)
    policy, action_probs = model(state)
    policy = policy.detach().cpu().numpy()
    
    pick_action = actionss[action]
    moves_count[pick_action]+=1
    state, reward, done, truncated, info = manager.step(agent_id, pick_action)
    if RENDER_MAZE:
        manager.render(agent_id)
print(moves_count)

0 rescue items


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [None]:
pygame.quit()

: 

: 