In [1]:
import sys
import numpy as np
import math
import random
import json
import requests

import gym
import gym_maze
from gym_maze.envs.maze_manager import MazeManager
from riddle_solvers import *

import torch
import pygame
import time
from collections import deque
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import warnings

warnings.filterwarnings('ignore')

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_name = torch.cuda.get_device_name(torch.cuda.current_device())
device_name

'NVIDIA GeForce RTX 3050 Laptop GPU'

In [3]:
def flat_obv(obv):
    flattened = []
    for dim in obv:
        if isinstance(dim, list):
            for arr in dim:
                if isinstance(arr, list):
                    for item in arr:
                        flattened.append(item)
                else:
                    flattened.append(arr)
        else:
            flattened.append(dim)
    return flattened

In [4]:
class policy(nn.Module):
    def __init__(self, s_size, a_size, h_size1,h_size2) -> None:
        super().__init__() # calling the super class
        
        #Network layers 
        self.L1 = nn.Linear(s_size, h_size1)
        self.L2 = nn.Linear(h_size1, h_size2)
        self.L3 = nn.Linear(h_size2, a_size)
    
    def forward(self, x):
        # the forward pass of the network
        x = F.relu(self.L1(x))
        x = F.relu(self.L2(x))
        x = self.L3(x)
        x = F.softmax(x,dim=1)
        return x
    
    
    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device) # setting the state to tensor with the input shape
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)
    

In [5]:
s_size = 14
a_size = 4

In [6]:
maze_hyper = {
    "h_size1": 16,
    "h_size2": 32,
    "h_size3" : 16,
    "n_training_episodes": 1000,
    "n_evaluation_episodes": 10,
    "max_t": 5000,
    "gamma": 1.0,
    "lr": 1e-2,
    "state_space": s_size,
    "action_space": a_size,
}

In [7]:
maze_policy = policy(maze_hyper["state_space"], maze_hyper["action_space"], maze_hyper["h_size2"], maze_hyper["h_size3"]).to(device)

In [8]:
def convert_state(state):
    state0 = state[0]
    state1 = state[1]
    state2 = state[2]
    
    ans = []
    
    for i in state0:
        ans.append(i)
    
    for i in state1:
        ans.append(i)
    
    for i in state2:
        for j in i:
            ans.append(j)
    
    return ans

In [9]:
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
    
    actions = ['N', 'S', 'E', 'W']

    sample_maze = np.load("hackathon_sample.npy")
    agent_id = "9" # add your agent id here
    
    manager = MazeManager()
    manager.init_maze(agent_id, maze_cells = sample_maze)
    env = manager.maze_map[agent_id]

    riddle_solvers = {'cipher': cipher_solver, 'captcha': captcha_solver, 'pcap': pcap_solver, 'server': server_solver}
    maze = {}
    states = {}

    
    maze['maze'] = env.maze_view.maze.maze_cells.tolist()
    maze['rescue_items'] = list(manager.rescue_items_dict.keys())

    MAX_T = 5000
    RENDER_MAZE = True
    

    with open("./states.json", "w") as file:
        json.dump(states, file)

    
    with open("./maze.json", "w") as file:
        json.dump(maze, file)

    
    for i_episode in range(1, n_training_episodes):
        state = manager.reset(agent_id)
        num_of_actions = 0
        saved_log_probs = []
        rewards= []
        rid_recused = [0,0,0,0]
        state = convert_state(state)
        state = np.array(state)
        
        for t in range(MAX_T):
            
            num_of_actions+=1

            
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            action = actions[action]
            
            state, reward, done, truncated, info = manager.step(agent_id, action)
            
            reward = 1
            
            state = convert_state(state)
            state = np.array(state)
            
            if state[2] == 0 and rid_recused[0] == 0 :
                rid_recused[0] = 1
                reward -= 1000 
                
                
            if state[3] == 0 and rid_recused[1] == 0:
                rid_recused[1] = 1
                reward -= 1000 
                
                
            if state[4] == 0 and rid_recused[2] == 0:
                rid_recused[2] == 1
                reward -= 1000 
                
                
            if state[5] == 0 and rid_recused[3]  == 0 :
                rid_recused[3] = 1
                reward -= 1000 
            
            manager.render(agent_id)


            rewards.append(reward)
            saved_log_probs.append(log_prob)

            if done:
                break
        
            
            
        total_reward = sum(rewards)
        if state[0] != 9 or state[1] != 9:
            NotFinished = True
            
        eps = np.finfo(np.float32).eps.item() # getting the smallest positive float
        rewards = torch.tensor(rewards, dtype=float)
        rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
        
        
        policy_loss = []
        
        for action_log_prob, disconted_reward in zip(saved_log_probs, rewards):
            term = action_log_prob * disconted_reward
            if NotFinished:
                policy_loss.append(term * 1.2)
            else:
                policy_loss.append(term)
        
        policy_loss = torch.cat(policy_loss).sum() 
        
        print(policy_loss)
        
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        
        
        
        print('Episode {}\t Score {}'.format(i_episode, total_reward))

In [10]:
optimizer = optim.SGD(maze_policy.parameters(), lr=0.01)

In [11]:
reinforce(          maze_policy,
                    optimizer,
                    maze_hyper["n_training_episodes"], 
                    maze_hyper["max_t"],
                    maze_hyper["gamma"], 
                    100)

tensor(-35.8914, grad_fn=<SumBackward0>)
Episode 1	 Score 4000
tensor(0., grad_fn=<SumBackward0>)
Episode 2	 Score 5000
tensor(0., grad_fn=<SumBackward0>)
Episode 3	 Score 5000
tensor(0., grad_fn=<SumBackward0>)
Episode 4	 Score 5000
tensor(0., grad_fn=<SumBackward0>)
Episode 5	 Score 5000
tensor(0., grad_fn=<SumBackward0>)
Episode 6	 Score 5000
tensor(0., grad_fn=<SumBackward0>)
Episode 7	 Score 5000
tensor(0., grad_fn=<SumBackward0>)
Episode 8	 Score 5000
tensor(0., grad_fn=<SumBackward0>)
Episode 9	 Score 5000
tensor(0., grad_fn=<SumBackward0>)
Episode 10	 Score 5000


KeyboardInterrupt: 

In [281]:
torch.save(maze_policy, "SecondModel.pth")

In [282]:
loaded_state_dict = torch.load('FirstModel.pth')
model = policy()
model.load_state_dict(loaded_state_dict)

TypeError: policy.__init__() missing 4 required positional arguments: 's_size', 'a_size', 'h_size1', and 'h_size2'

In [210]:
actions = ['N', 'S', 'E', 'W']

sample_maze = np.load("hackathon_sample.npy")
agent_id = "9" # add your agent id here
    
manager = MazeManager()
manager.init_maze(agent_id, maze_cells = sample_maze)
env = manager.maze_map[agent_id]

riddle_solvers = {'cipher': cipher_solver, 'captcha': captcha_solver, 'pcap': pcap_solver, 'server': server_solver}
maze = {}
states = {}

    
maze['maze'] = env.maze_view.maze.maze_cells.tolist()
maze['rescue_items'] = list(manager.rescue_items_dict.keys())

MAX_T = 5000
RENDER_MAZE = True
    

with open("./states.json", "w") as file:
    json.dump(states, file)

    
with open("./maze.json", "w") as file:
    json.dump(maze, file)
    
    state = manager.reset(agent_id)
    
    for i in range(5000):
        state = convert_state(state)
        state = np.array(state)
        action, log_prob = maze_policy.act(state=state)
        action = actions[action]
        print(action)
        state, reward, done, truncated, info = manager.step(agent_id, action)
        if RENDER_MAZE:
            manager.render(agent_id)
        time.sleep(.5)

N
N
E
N
E
S
W
N
E
E
W
W
N
S
S
W
S
W
N
N
W
S
W
W
W
W
W
S
W
N
W
S
N
W
E
S
S
W
W
W
W
S
S
E
E
S
W
W
W
S
N
W
S
W
W
S
W
W
N
W
N
S
S
W
W
W
W
N
S
W


KeyboardInterrupt: 

In [1]:
pygame.quit()

NameError: name 'pygame' is not defined