In [1]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
import numpy as np
from mlagents_envs.environment import ActionTuple

# env =  UE(file_name="stage0_160523\stage0_copy",seed=1,side_channels=[])
# Start the Unity player from the local S4 build (seed 4, worker id 1,
# graphics enabled) and reset it so the first observations are available.
# NOTE(review): the raw string holds a doubled backslash before "build";
# it is kept exactly as written, but it looks accidental -- confirm.
env = UE(
    file_name=r"C:\Users\linzj\Desktop\environment\S4\\build",
    seed=4,
    side_channels=[],
    worker_id=1,
    no_graphics=False,
)
env.reset()

In [2]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

# ---- Dimensions shared by the network modules defined below ----

# Language branch
num_words = 44  # Number of unique words in the vocabulary
embedding_dim = 128
language_output_dim = 128

# Vision branch: 64 channels x 7 x 7 after the 3-layer CNN on a (3,128,128) frame
vision_output_dim = 64 * 7 * 7  # = 3136

# Fusion, recurrent core and policy head
mixing_dim = 256
lstm_hidden_dim = 256
num_actions = 4

# (3,128,128) --> (64,7,7) = 3136 (3-layer CNN)
class VisualModule(nn.Module):
    """Three-layer convolutional encoder for the image observation.

    Each layer is a 3x3 convolution followed by ReLU. For a (3, 128, 128)
    input the spatial size shrinks 128 -> 64 -> 22 -> 7, so the flattened
    output has 64 * 7 * 7 = 3136 features per image.
    """

    def __init__(self):
        super().__init__()
        stages = [
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=3, padding=0),
            nn.ReLU(),
        ]
        # Attribute name and layer positions define the state_dict keys
        # (conv.0.*, conv.2.*, conv.4.*).
        self.conv = nn.Sequential(*stages)
        # self.conv = nn.Sequential(
        #     nn.Conv2d(3, 32, kernel_size=5, stride=2, padding=2),
        #     nn.ReLU(),
        #     nn.Conv2d(32, 64, kernel_size=5, stride=2, padding=2),
        #     nn.ReLU(),
        #     nn.Conv2d(64, 128, kernel_size=5, stride=2, padding=2),
        #     nn.ReLU(),
        #     nn.Conv2d(128, 64, kernel_size=5, stride=2, padding=1),
        #     nn.ReLU(),
        #     nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
        #     nn.ReLU(),
        #     nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
        #     nn.ReLU(),
        # )

    def forward(self, vt):
        """Encode `vt` and flatten each image's feature maps into one vector.

        The trailing `squeeze()` removes every size-1 dimension, so a single
        image (batch size 1) comes back as a 1-D vector, not a (1, N) row.
        """
        feature_maps = self.conv(vt)
        batch_size = vt.size(0)
        flattened = feature_maps.view(batch_size, -1)
        return flattened.squeeze()

# one-hot encoding [0 0 1 0 0] --> 128 dimensional embedding (FF)
class LanguageModule(nn.Module):
    """Projects a `num_words`-long instruction vector to `embedding_dim` features.

    The projection is a single affine layer with no non-linearity.
    """

    def __init__(self, num_words, embedding_dim):
        super().__init__()
        self.embedding = nn.Linear(in_features=num_words, out_features=embedding_dim)
        # self.embedding = nn.Sequential(
        #             nn.Linear(num_words, embedding_dim),
        #             nn.ReLU())

    def forward(self, lt):
        """Return the linear embedding of the instruction vector `lt`."""
        return self.embedding(lt)

# 3136(vision) + 128 (language) --> 256 dimensional embedding (FF)
class MixingModule(nn.Module):
    """Fuses vision and language features with one linear layer.

    The two feature vectors are concatenated along their last (feature)
    dimension and projected to `mixing_dim`.

    Args:
        vision_output_dim: number of features in the vision vector.
        language_output_dim: number of features in the language vector.
        mixing_dim: number of features produced by the fusion layer.
    """

    def __init__(self, vision_output_dim, language_output_dim, mixing_dim):
        super(MixingModule, self).__init__()
        self.linear = nn.Linear(vision_output_dim + language_output_dim, mixing_dim)

        # self.linear = nn.Sequential(
        #             nn.Linear(vision_output_dim + language_output_dim, mixing_dim),
        #             nn.ReLU())

    def forward(self, vision_output, language_output):
        """Concatenate the two feature tensors and project them.

        Args:
            vision_output: tensor whose last dimension is `vision_output_dim`.
            language_output: tensor whose last dimension is
                `language_output_dim`; leading dimensions must match those of
                `vision_output`.

        Returns:
            Tensor with the same leading dimensions and `mixing_dim` features.
        """
        # Concatenate on the feature axis. For unbatched 1-D inputs this is
        # identical to the previous dim=0, and it additionally accepts
        # batched (B, features) inputs, which dim=0 rejected.
        combined_output = torch.cat((vision_output, language_output), dim=-1)
        mixed_output = self.linear(combined_output)
        return mixed_output

class LSTMModule(nn.Module):
    """Thin wrapper around a single `nn.LSTMCell` step."""

    def __init__(self, mixing_dim, lstm_hidden_dim):
        super().__init__()
        self.lstm = nn.LSTMCell(input_size=mixing_dim, hidden_size=lstm_hidden_dim)

    def forward(self, mixed_output, lstm_hidden_state):
        """Advance the cell by one step.

        Returns the new `(hidden_state, cell_state)` pair; callers that need
        the LSTM output read element 0 of the returned tuple.
        """
        next_hidden, next_cell = self.lstm(mixed_output, lstm_hidden_state)
        return (next_hidden, next_cell)

class Agent(nn.Module):
    """Vision + language actor-critic agent with an LSTM core.

    Pipeline: the image goes through `VisualModule`, the instruction vector
    through `LanguageModule`, both are fused by `MixingModule`, passed through
    one `LSTMModule` step, and two linear heads read the LSTM hidden state:
    `action_predictor` (action logits) and `value_estimator` (scalar value).

    As wired here the forward pass handles one observation at a time: the
    visual features are squeezed to 1-D, concatenated with the 1-D language
    embedding, and a batch dimension of 1 is re-added before the LSTM cell.
    """

    # Default checkpoint root used by save()/load() when `model_dir` is None.
    DEFAULT_MODEL_DIR = r'C:\Users\linzj\Desktop\model'

    def __init__(self, num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions):
        super(Agent, self).__init__()
        self.language_module = LanguageModule(num_words, embedding_dim)
        self.visual_module = VisualModule()
        self.mixing_module = MixingModule(vision_output_dim, language_output_dim, mixing_dim)
        self.lstm_module = LSTMModule(mixing_dim, lstm_hidden_dim)
        self.action_predictor = nn.Linear(lstm_hidden_dim, num_actions)
        self.value_estimator = nn.Linear(lstm_hidden_dim, 1)

    def forward(self, vt, lt, lstm_hidden_state):
        """Run one time step.

        Args:
            vt: image tensor for a single observation (batch size 1).
            lt: 1-D instruction vector of length `num_words`.
            lstm_hidden_state: `(hidden, cell)` pair from the previous step.

        Returns:
            `(action_logits, value_estimate, lstm_state)` where
            `action_logits` are unnormalised scores (apply softmax to obtain
            probabilities), and `lstm_state` is the new `(hidden, cell)` pair
            to feed into the next call.
        """
        vision_output = self.visual_module(vt)
        language_output = self.language_module(lt)
        # The mixing output is 1-D here; LSTMCell needs a leading batch axis.
        mixed_output = self.mixing_module(vision_output, language_output).unsqueeze(0)
        lstm_output = self.lstm_module(mixed_output, lstm_hidden_state)
        hidden = lstm_output[0]  # element 0 is the hidden state, element 1 the cell state
        action_logits = self.action_predictor(hidden)
        value_estimate = self.value_estimator(hidden)
        return action_logits, value_estimate, lstm_output

    def _checkpoint_dir(self, ALG_NAME, ENV_ID, model_dir=None):
        """Return the directory `<root>/<ALG_NAME>_<ENV_ID>` holding checkpoints."""
        root = self.DEFAULT_MODEL_DIR if model_dir is None else model_dir
        return os.path.join(root, '_'.join([ALG_NAME, ENV_ID]))

    def save(self, episode, ALG_NAME, ENV_ID, model_dir=None):
        """Write the state dict to `<root>/<ALG_NAME>_<ENV_ID>/agent_<episode>.pt`.

        Args:
            episode: identifier embedded in the file name.
            ALG_NAME, ENV_ID: strings joined with '_' to form the sub-directory.
            model_dir: optional checkpoint root; defaults to `DEFAULT_MODEL_DIR`.
        """
        path = self._checkpoint_dir(ALG_NAME, ENV_ID, model_dir)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(path, f'agent_{episode}.pt'))

    def load(self, episode, ALG_NAME, ENV_ID, model_dir=None, map_location='cpu'):
        """Load the state dict saved by `save()` into this module.

        Args:
            episode: identifier embedded in the file name.
            ALG_NAME, ENV_ID: strings joined with '_' to form the sub-directory.
            model_dir: optional checkpoint root; defaults to `DEFAULT_MODEL_DIR`.
            map_location: forwarded to `torch.load`. The 'cpu' default lets a
                checkpoint written on a GPU machine load on a CPU-only one;
                `load_state_dict` then copies the values into this module's
                parameters wherever they live.

        Raises:
            FileNotFoundError: if the checkpoint file does not exist.
        """
        path = self._checkpoint_dir(ALG_NAME, ENV_ID, model_dir)
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        state = torch.load(os.path.join(path, f'agent_{episode}.pt'), map_location=map_location)
        self.load_state_dict(state)

        


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#-1(20000): -5 # -2.5 (10000): -6.73  #-5 (5000):-9.46  # -7.5 (5000): -7.02  #-10 (5000):-10(circle same place)
# S1_4:12996      S1_13:6495           S1_14:4587        S1_15: 4073            S1_16:3085
from torch.distributions import Categorical

# ---------------------------------------------------------------------------
# Evaluation: load one checkpoint and roll it out for TEST_EPISODES episodes,
# sampling actions from the policy, then report the mean episode reward and
# the number of episodes whose total reward is exactly 10.
# ---------------------------------------------------------------------------
device = torch.device("cpu")
episode = 204493  # checkpoint id: agent.load() reads agent_<episode>.pt
ALG_NAME = 'S4'
ENV_ID = '6'
speed = 3  # the one-hot action vector is multiplied by this before it is sent
MAX_STEPS = 500
TEST_EPISODES = 1000
tracked_agent = -1  # sentinel; replaced by the first agent id seen in decision_steps
agent = Agent(num_words, embedding_dim, vision_output_dim, language_output_dim, mixing_dim, lstm_hidden_dim,num_actions)
agent.load(episode,ALG_NAME,ENV_ID)
agent.eval()
average = 0

# Vocabulary: index -> word. The instruction has seven slots
# (determiner, colour, shape, preposition, determiner, colour, shape); each
# slot owns a contiguous index range, and SLOT_OFFSETS[i] is where slot i starts.
_DETERMINERS = ['A', 'Few', 'Some', 'Many', 'This', 'That', 'These', 'Those']
_COLORS = ['red', 'green', 'blue', 'yellow', 'black']
_SHAPES = ['capsule', 'cube', 'cylinder', 'prism', 'sphere']
_PREPOSITIONS = ['Above', 'Below', 'In front of', 'Behind', 'Beside', 'On', 'Between', 'Among']
hashmap = {}
SLOT_OFFSETS = []
for _slot_words in (_DETERMINERS, _COLORS, _SHAPES, _PREPOSITIONS, _DETERMINERS, _COLORS, _SHAPES):
    SLOT_OFFSETS.append(len(hashmap))  # 0, 8, 13, 18, 26, 34, 39
    for _word in _slot_words:
        hashmap[len(hashmap)] = _word
assert len(hashmap) == num_words, "vocabulary size must match the language module input"

success = 0
# The behavior name/spec do not depend on the episode, so look them up once.
behavior_name = list(env.behavior_specs)[0]
spec = env.behavior_specs[behavior_name]

# `test_episode` (not `episode`) so the checkpoint id above is not overwritten.
for test_episode in range(TEST_EPISODES):
    STEPS = 0
    episode_reward = 0
    # state = env.reset().astype(np.float32)
    decision_steps, terminal_steps = env.get_steps(behavior_name)

    # state -- vt, lt, lstm
    vt = torch.tensor(decision_steps.obs[0]).reshape(1, 3, 128, 128).to(device)

    # obs[1][0] carries one within-slot word id per slot; shift each id by its
    # slot offset to get the vocabulary index.
    raw_tokens = decision_steps.obs[1][0]
    word_indices = [int(raw_tokens[slot]) + offset for slot, offset in enumerate(SLOT_OFFSETS)]
    print('---' + ' '.join(hashmap[idx] for idx in word_indices) + '---')

    # Multi-hot instruction vector: one active entry per slot.
    lt = torch.zeros(num_words).to(device)
    lt[word_indices] = 1

    # Fresh (hidden, cell) state at the start of every episode.
    lstm_hidden_state = (torch.zeros(1, lstm_hidden_dim).to(device), torch.zeros(1, lstm_hidden_dim).to(device))
    done = False
    while not done:
        STEPS += 1
        # Reset every step: otherwise a step in which the tracked agent shows
        # up in neither step set would re-add the previous step's reward
        # (or raise NameError on the very first step).
        reward = 0.0

        # Pure inference: no autograd graph is needed, so nothing has to be
        # detached afterwards.
        with torch.no_grad():
            policy_dist, value, lstm_hidden_state = agent(vt, lt, lstm_hidden_state)
            action_probs = F.softmax(policy_dist, dim=1)
        # Not used by the loop; kept so the last step's probabilities stay
        # available in the kernel under the original name.
        dist = action_probs.cpu().numpy()

        action = Categorical(action_probs).sample()  # sample an action from the policy
        action_onehot = F.one_hot(action, num_actions)

        discrete_actions = action_onehot.cpu().numpy().reshape(1, num_actions) * speed
        action_tuple = ActionTuple()
        action_tuple.add_discrete(discrete_actions)
        env.set_actions(behavior_name, action_tuple)
        env.step()
        decision_steps, terminal_steps = env.get_steps(behavior_name)

        if tracked_agent == -1 and len(decision_steps) >= 1:
            tracked_agent = decision_steps.agent_id[0]

        if tracked_agent in terminal_steps:  # roll over or hit the target
            print('Agent in terminal steps')
            done = True
            reward = terminal_steps[tracked_agent].reward
            if not reward > 0:
                reward = -1  # roll over or other unseen conditions
            print(f'Terminal Step reward: {reward}')
        elif tracked_agent in decision_steps:  # the agent which requires action
            reward = decision_steps[tracked_agent].reward

        # Time-out only applies when the episode has not already ended on this
        # step; a terminal outcome on the last allowed step is kept as is.
        if not done and STEPS >= MAX_STEPS:
            reward = -10
            print(f'Max Step Reward: {reward}')
            env.reset()
            done = True

        episode_reward = episode_reward + reward
        if not done:
            # The next episode re-reads its first frame from env.get_steps(),
            # so the observation only needs refreshing while this one runs.
            vt = torch.tensor(decision_steps.obs[0]).reshape(1, 3, 128, 128).to(device)

    # if episode_reward < -10: episode_reward = -10
    # NOTE(review): success requires the total to equal 10 exactly, so any
    # non-zero intermediate reward excludes the episode -- confirm intended.
    if episode_reward == 10:
        success += 1
    average += episode_reward / TEST_EPISODES
    print(f'Episode: {test_episode}, Episode reward: {episode_reward}')
print(f'Average Episode Reward: {average}, Success Rate: {success}/{TEST_EPISODES}')


---Many red prism Below A black sphere---




Agent in terminal steps
Terminal Step reward: 10.0
Episode: 0, Episode reward: 10.0
---This blue cube Behind A yellow prism---
Agent in terminal steps
Terminal Step reward: 10.0
Episode: 1, Episode reward: 10.0
---Few red capsule Among These yellow prism---
Agent in terminal steps
Terminal Step reward: 10.0
Episode: 2, Episode reward: 10.0
---Many green cylinder In front of These black cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Episode: 3, Episode reward: 10.0
---Few green cube In front of Some green sphere---
Agent in terminal steps
Terminal Step reward: 10.0
Episode: 4, Episode reward: 10.0
---A black cube Beside Some blue cube---
Agent in terminal steps
Terminal Step reward: 10.0
Episode: 5, Episode reward: 10.0
---These green cube Below Many blue cylinder---
Agent in terminal steps
Terminal Step reward: 10.0
Episode: 6, Episode reward: 10.0
---That green cylinder Above A red capsule---
Agent in terminal steps
Terminal Step reward: 10.0
Episode: 7, Episode reward