**Adding libraries and dependencies**

In [15]:
import numpy as np
import tensorflow as tf
import pandas as pd
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.base_env import ActionTuple
from collections import deque
import random
from tensorflow.keras import layers, Model
import os
import time

**Establishing connection with Unity environment**

In [16]:
engine_config_channel = EngineConfigurationChannel()

env = UnityEnvironment(
    file_name=None,
    worker_id=0,
    base_port=5004,
    side_channels=[engine_config_channel]
)

In [17]:
env.reset()
behavior_names = list(env.behavior_specs)
behavior_name = behavior_names[0]
behavior_spec = env.behavior_specs[behavior_name]

print(f"Behavior: {behavior_name}")
print(f"Action branches: {behavior_spec.action_spec.discrete_branches}")
print(f"Observation shapes: {[obs.shape for obs in behavior_spec.observation_specs]}")

decision_steps, terminal_steps = env.get_steps(behavior_name)
print(f"Active agents: {len(decision_steps)}")

Behavior: RobotBehaviour?team=0
Action branches: (5,)
Observation shapes: [(36,), (8,)]
Active agents: 1


### Action Space
- **5 discrete actions** corresponding to possible movements:
  - `0`: Do nothing
  - `1`: Go right  
  - `2`: Go left
  - `3`: Go forward
  - `4`: Go backwards

### Observation Space

#### Ray Perception (36 dimensions)
- **36 values** from ML-Agents Ray Perception Sensor 3D component
- Detects sphere tags and distances in the environment

#### Agent State (8 dimensions)
- **8 values** divided as follows:
  - Position: `x`, `y`, `z` coordinates
  - Velocity: `x`, `y`, `z` velocity components  
  - Rotation: `y`-axis rotation (facing direction)
  - Time: Elapsed time since episode start

**Total observation size:** 44

---

In [18]:
action_size = 5
state_size = 44

Preparing a get_state function for model training and inference, that is responsible for getting the agent's state at the current step and returning information which will later be used by the model :
- state (44,) : agent's state at the current step
- reward (float) : reward of the current step
- done (boolean) : whether the episode is terminated at the current step

In [21]:
def get_state():
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    
    if len(decision_steps) > 0:
        obs1 = decision_steps.obs[0][0] #Ray perception sensor observations
        obs2 = decision_steps.obs[1][0] #Agent state observations
        state = np.concatenate([obs1, obs2]) #Combine observations for correct DQN input format
        return state, decision_steps.reward[0], False
    
    elif len(terminal_steps) > 0:
        # Episode ended
        obs1 = terminal_steps.obs[0][0]
        obs2 = terminal_steps.obs[1][0] 
        state = np.concatenate([obs1, obs2])
        return state, terminal_steps.reward[0], True
    
    return None, 0, False


**Making the Deep-Q-Network**

In [19]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(state_size,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(action_size, activation='linear')
])

model.compile(optimizer='adam', loss='mse')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
model.summary()

In [40]:
l1_weights = model.get_weights()[0]
l1_weights[:,0] #weights of the first neuron of the first layer

array([ 0.18638815,  0.22980653, -0.20528173,  0.16089182,  0.15823342,
        0.16693343, -0.02030241,  0.01310329,  0.0308039 , -0.11465297,
       -0.05481113,  0.0300891 ,  0.06479736,  0.13707413,  0.01976277,
        0.15838932,  0.10911919, -0.11471327, -0.09875946,  0.11628775,
        0.01441787,  0.17267524,  0.15704541,  0.03268729,  0.09555544,
       -0.21876311, -0.10274138,  0.09107761,  0.04657273,  0.03278114,
        0.20651777, -0.06495936,  0.02366601,  0.20500664,  0.02424057,
        0.15620904, -0.15711457, -0.18866795, -0.00094914,  0.22571234,
        0.15795566, -0.00803611,  0.2254603 , -0.18097895], dtype=float32)

Running the model with the initial random weights

In [23]:
for step in range(1000):  # Run for 100 steps
    state, reward, done = get_state()
    
    if state is not None:
        q_values = model.predict(np.expand_dims(state, axis=0), verbose=0)
        action = np.argmax(q_values[0])  # Choose best action
        
        print(f"Step {step}: Action {action}, Reward {reward:.2f}")
        
        # Send action to Unity
        action_tuple = ActionTuple(discrete=np.array([[action]]))
        env.set_actions(behavior_name, action_tuple)
        env.step()
        
        if done:
            env.reset()
    

Step 0: Action 0, Reward 0.00
Step 1: Action 0, Reward 0.00
Step 2: Action 0, Reward 0.00
Step 3: Action 0, Reward 0.00
Step 4: Action 0, Reward 0.00
Step 5: Action 0, Reward 0.00
Step 6: Action 0, Reward 0.00
Step 7: Action 0, Reward 0.00
Step 8: Action 0, Reward 0.00
Step 9: Action 0, Reward 0.00
Step 10: Action 0, Reward 0.00
Step 11: Action 0, Reward 0.00
Step 12: Action 0, Reward 0.00
Step 13: Action 0, Reward 0.00
Step 14: Action 0, Reward 0.00
Step 15: Action 0, Reward 0.00
Step 16: Action 0, Reward 0.00
Step 17: Action 0, Reward 0.00
Step 18: Action 0, Reward 0.00
Step 19: Action 0, Reward 0.00
Step 20: Action 0, Reward 0.00
Step 21: Action 0, Reward 0.00
Step 22: Action 0, Reward 0.00
Step 23: Action 0, Reward 0.00
Step 24: Action 0, Reward 0.00
Step 25: Action 0, Reward 0.00
Step 26: Action 0, Reward 0.00
Step 27: Action 0, Reward 0.00
Step 28: Action 0, Reward 0.00
Step 29: Action 0, Reward 0.00
Step 30: Action 0, Reward 0.00
Step 31: Action 0, Reward 0.00
Step 32: Action 0,

UnityCommunicatorStoppedException: Communicator has exited.