In [1]:
import requests
import numpy as np
from gymnasium import spaces
import gymnasium as gym
from stable_baselines3 import DQN
from tenacity import retry, stop_after_attempt, wait_exponential
import logging
import time
from stable_baselines3.common.callbacks import BaseCallback
import os


In [2]:
# keep track of moves
obsers = []


# Set up the logger
log_filename = "maze_agent_run.log"  # Log file name

try:
    if os.path.exists(log_filename):
        os.remove(log_filename)
except:
    pass
# Set the logging level for urllib3 to WARNING
urllib3_logger = logging.getLogger("urllib3")
urllib3_logger.setLevel(logging.INFO)

logging.basicConfig(
    level=logging.DEBUG,  # Set the logging level
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",  # Format for log messages
    handlers=[
        logging.FileHandler(log_filename),  # Log to a file
        #logging.StreamHandler()  # Log to the console
    ]
    
)

def get_info(response):
    if response.status_code != 200:
        print("Error code at response")
        return None, None, None, None, None

    # Retrieve JSON data from response
    data = response.json()

    # Use .get() method with default to None for each field
    done = data.get('done', None)
    info = data.get('info', None)
    observation = data.get('observation', None)
    reward = data.get('reward', None)
    trunc = data.get('trunc', None)

    # Convert observation to numpy array if it's not None
    if observation is not None:
        observation = np.array(observation, dtype=np.float32)
        obsers.append(observation)  # Assuming obsers is defined elsewhere
    else:
        observation = None

    return done, info, observation, reward, trunc
    
# implement retry policy
@retry(stop=stop_after_attempt(5),wait=wait_exponential(multiplier=1,min=4,max=10))
def make_request(url,headers,data=None):
    if data:
        response = requests.post(url,headers=headers,json=data)
    else:
        response = requests.post(url,headers=headers)
    #raise http error for bad responses
    response.raise_for_status()

    time.sleep(0.1)
    logging.info(f"Action:{url.split('/')[3]} data {data}, obs: {response.json().get('observation')},reward: {response.json().get('reward')} ")
    return response


In [5]:
class TrainingCallback(BaseCallback):
    def __init__(self, verbose=1):
        super(TrainingCallback, self).__init__(verbose)
    
    def _on_step(self) -> bool:
        # Log reward and step details
        if self.n_calls % 10 == 0:  # Log every 10th step
            logging.info(f"Step: {self.n_calls}, Reward: {self.locals['rewards']}")
        return True

    def _on_rollout_end(self) -> None:
        # Called at the end of each rollout (i.e., after each epoch)
        #logging.info(f"End of epoch. Total steps: {self.num_timesteps}")
        logging.info(f"Last rewards: {self.locals['rewards']}")
    



class MazeAPIEnv(gym.Env):
    def __init__(self, api_step_url, headers,api_reset):
        super(MazeAPIEnv, self).__init__()
        self.headers = headers = {'Content-Type': 'application/json'}
        self.api_step_url = api_step_url  # URL for the API step endpoint
        self.headers = headers  # Headers for authorization or any other required fields
        self.api_reset = api_reset


        # Define the action and observation space
        self.action_space = spaces.Discrete(4)
        
        
        # For multi-dimensional observations, use Box space
        self.observation_space = spaces.Box(low=0, high=4, shape=(2,), dtype=np.float32)  # Observation space

        # Normalization parameters
        self.obs_low = self.observation_space.low
        self.obs_high = self.observation_space.high
        self.obs_range = self.obs_high - self.obs_low
        
        self.current_state = np.array([0.0, 0.0], dtype=np.float32)  # Starting position
        self.done = False
        
    def normalize(self, observation):
        """ Normalize the observation to the range [0, 1]. """
        return (observation ) / 10 
        
    def reset(self,seed=None,**kwargs):
       
        response = make_request(url=self.api_reset, headers=self.headers)

        self.current_state = np.array([0.0, 0.0], dtype=np.float32)
        self.done = False
        
        return self.current_state, {}
        
    def step(self, action):
        # Send the action to the API
        content = {'action': int(action)}
        
        response = make_request(url=self.api_step_url, headers=self.headers, data=content)
    
        if response.status_code !=200 :
            print("error code in step")
        # Extract the response data
  
        done,info,raw_observation,reward,truncated = get_info(response)
        reward = reward +1
        self.current_state = self.normalize(np.array(raw_observation, dtype=np.float32))

        # Update current state
        self.done = done
        

        #logging.info(f"Current State = {self.current_state}")      
          
        return self.current_state, reward, done, {}
    


In [4]:
#*******************RANDOM AGENT******************************
# Define the API endpoint and headers
api_new_game = "http://18.185.60.20:5005/new_game"


headers = {'Content-Type': 'application/json'}

# Start new game
response = make_request(url = api_new_game, headers=headers)
uuid = response.json().get('uuid')

api_reset = "http://18.185.60.20:5005/reset/"+uuid
api_step_url = "http://18.185.60.20:5005/step/"+uuid

# Instantiate the custom environment
env = MazeAPIEnv(api_step_url, headers,api_reset)



obs, info = env.reset()
n_steps = 10
for _ in range(n_steps):
    # Random action
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated:
        obs, info = env.reset()

In [None]:
############# DEEP Q NETWORK#################

# Define the API endpoint and headers
api_new_game = "http://18.185.60.20:5005/new_game"

headers = {'Content-Type': 'application/json'}

# Start new game
response = make_request(url = api_new_game, headers=headers)
uuid = response.json().get('uuid')

api_reset = "http://18.185.60.20:5005/reset/"+uuid
api_step_url = "http://18.185.60.20:5005/step/"+uuid

# Instantiate the custom environment
env = MazeAPIEnv(api_step_url, headers,api_reset)

# Instantiate the DQN model with the custom policy
policy_kwargs = dict(
    net_arch=[128, 256, 128]  # Hidden layers
)

model = DQN(
    policy="MlpPolicy",
    env=env,
    policy_kwargs=policy_kwargs,
    verbose=2,
    exploration_fraction=0.2,
    exploration_final_eps=0.05
)
# Train the agent
logging.info("Starting the training process...")
callback = TrainingCallback()

# Train the model
model.learn(total_timesteps=5000,progress_bar=True,callback=callback)  # Adjust total_timesteps as needed


In [4]:
############### A2C ########################
from stable_baselines3 import A2C
# Define the API endpoint and headers
api_new_game = "http://18.185.60.20:5005/new_game"


headers = {'Content-Type': 'application/json'}

# Start new game
response = make_request(url = api_new_game, headers=headers)
uuid = response.json().get('uuid')

api_reset = "http://18.185.60.20:5005/reset/"+uuid
api_step_url = "http://18.185.60.20:5005/step/"+uuid

# Instantiate the custom environment
env = MazeAPIEnv(api_step_url, headers,api_reset)

logging.info("Starting training")

# Instantiate the A2C model
model = A2C('MlpPolicy', env, verbose=2)

# Train the model
model.learn(total_timesteps=5000,progress_bar=True)

# Save the model
model.save("a2c_maze_model")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Output()

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 100      |
|    ep_rew_mean        | 38.6     |
| time/                 |          |
|    fps                | 4        |
|    iterations         | 100      |
|    time_elapsed       | 109      |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.036    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.394    |
|    value_loss         | 0.387    |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 100      |
|    ep_rew_mean        | 38.6     |
| time/                 |          |
|    fps                | 4        |
|    iterations         | 200      |
|    time_elapsed       | 217      |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | -0.244   |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | 0.0746   |
|    value_loss         | 0.407    |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 100      |
|    ep_rew_mean        | 38.6     |
| time/                 |          |
|    fps                | 3        |
|    iterations         | 300      |
|    time_elapsed       | 408      |
|    total_timesteps    | 1500     |
| train/                |          |
|    entropy_loss       | -1.35    |
|    explained_variance | 0.212    |
|    learning_rate      | 0.0007   |
|    n_updates          | 299      |
|    policy_loss        | 1.58     |
|    value_loss         | 1.71     |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 100      |
|    ep_rew_mean        | 38.6     |
| time/                 |          |
|    fps                | 3        |
|    iterations         | 400      |
|    time_elapsed       | 518      |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.3     |
|    explained_variance | -0.0314  |
|    learning_rate      | 0.0007   |
|    n_updates          | 399      |
|    policy_loss        | 0.45     |
|    value_loss         | 0.224    |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 100      |
|    ep_rew_mean        | 38.6     |
| time/                 |          |
|    fps                | 3        |
|    iterations         | 500      |
|    time_elapsed       | 708      |
|    total_timesteps    | 2500     |
| train/                |          |
|    entropy_loss       | -1.24    |
|    explained_variance | -0.0431  |
|    learning_rate      | 0.0007   |
|    n_updates          | 499      |
|    policy_loss        | 2.18     |
|    value_loss         | 5.22     |
------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 100       |
|    ep_rew_mean        | 38.6      |
| time/                 |           |
|    fps                | 3         |
|    iterations         | 600       |
|    time_elapsed       | 817       |
|    total_timesteps    | 3000      |
| train/                |           |
|    entropy_loss       | -1.36     |
|    explained_variance | -5.47e-05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 599       |
|    policy_loss        | -0.0143   |
|    value_loss         | 0.0928    |
-------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 100       |
|    ep_rew_mean        | 38.6      |
| time/                 |           |
|    fps                | 3         |
|    iterations         | 700       |
|    time_elapsed       | 1008      |
|    total_timesteps    | 3500      |
| train/                |           |
|    entropy_loss       | -1.26     |
|    explained_variance | -4.17e-05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 699       |
|    policy_loss        | 1.81      |
|    value_loss         | 4.42      |
-------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 100      |
|    ep_rew_mean        | 38.6     |
| time/                 |          |
|    fps                | 3        |
|    iterations         | 800      |
|    time_elapsed       | 1116     |
|    total_timesteps    | 4000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.0113   |
|    learning_rate      | 0.0007   |
|    n_updates          | 799      |
|    policy_loss        | -0.682   |
|    value_loss         | 0.3      |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 100      |
|    ep_rew_mean        | 38.6     |
| time/                 |          |
|    fps                | 3        |
|    iterations         | 900      |
|    time_elapsed       | 1307     |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -1.31    |
|    explained_variance | 0.000106 |
|    learning_rate      | 0.0007   |
|    n_updates          | 899      |
|    policy_loss        | 0.449    |
|    value_loss         | 0.352    |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 100      |
|    ep_rew_mean        | 38.6     |
| time/                 |          |
|    fps                | 3        |
|    iterations         | 1000     |
|    time_elapsed       | 1416     |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -1.13    |
|    explained_variance | 1.73e-06 |
|    learning_rate      | 0.0007   |
|    n_updates          | 999      |
|    policy_loss        | 1.61     |
|    value_loss         | 3.7      |
------------------------------------


--- Logging error ---
Traceback (most recent call last):
  File "c:\Users\DIMITRIS\AppData\Local\Programs\Python\Python310\lib\logging\__init__.py", line 1103, in emit
    stream.write(msg + self.terminator)
  File "c:\Users\DIMITRIS\AppData\Local\Programs\Python\Python310\lib\encodings\cp1253.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 689-752: character maps to <undefined>
Call stack:
  File "c:\Users\DIMITRIS\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\DIMITRIS\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\DIMITRIS\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\DIMITRIS\AppData\Roaming\Python

In [6]:
import numpy as np

from stable_baselines3 import DQN

def evaluate_policy(model, env: gym.Env, num_episodes: int = 10):
    """
    Evaluate the trained model policy on the given environment.

    :param model: Trained DQN model
    :param env: Environment instance
    :param num_episodes: Number of episodes to run for evaluation
    :return: Average reward per episode
    """
    total_rewards = []

    for episode in range(num_episodes):
        obs, _ = env.reset()
        done = False
        episode_reward = 0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            episode_reward += reward

        total_rewards.append(episode_reward)
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

    avg_reward = np.mean(total_rewards)
    print(f"Average Reward over {num_episodes} episodes: {avg_reward}")
    return avg_reward


from stable_baselines3 import A2C
# Example usage:
if __name__ == "__main__":  
    
        
    # Define the API endpoint and headers
    api_new_game = "http://18.185.60.20:5005/new_game"

    headers = {'Content-Type': 'application/json'}

    # Start new game
    response = make_request(url = api_new_game, headers=headers)
    uuid = response.json().get('uuid')

    api_reset = "http://18.185.60.20:5005/reset/"+uuid
    api_step_url = "http://18.185.60.20:5005/step/"+uuid

    # Instantiate the custom environment
    env = MazeAPIEnv(api_step_url, headers,api_reset)


    # Instantiate the A2C model
    model = A2C('MlpPolicy', env, verbose=2)


    model = A2C.load('a2c_maze_model.zip')  # Load your trained model

    avg_reward = evaluate_policy(model, env, num_episodes=10)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Episode 1: Reward = 0.99


KeyboardInterrupt: 