In [1]:
import requests
import numpy as np
from gymnasium import spaces
import gymnasium as gym
from stable_baselines3 import PPO
from tenacity import retry, stop_after_attempt, wait_exponential
import logging
import time
from stable_baselines3.common.callbacks import BaseCallback
import os
import json

In [2]:
# keep track of moves: get_info() appends every raw observation it parses here
obsers = []

# Set up the logger
log_filename = "maze_agent_run.log"  # Log file name

# Cap urllib3 at INFO so its DEBUG-level messages are dropped even though
# the root logger below is configured at DEBUG.
urllib3_logger = logging.getLogger("urllib3")
urllib3_logger.setLevel(logging.INFO)

logging.basicConfig(
    level=logging.DEBUG,  # Set the logging level
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",  # Format for log messages
    handlers=[
        # mode="w" truncates the file, so every run starts with an empty log
        # without having to delete the file by hand first.
        logging.FileHandler(log_filename, mode="w"),  # Log to a file
        #logging.StreamHandler()  # Log to the console
    ],
    # force=True closes and replaces handlers already attached to the root
    # logger. Without it, re-running this cell (or running it after another
    # library configured logging) makes basicConfig a silent no-op.
    force=True,
)


def get_info(response):
    """Unpack one API response into ``(done, info, observation, reward, truncated)``.

    Args:
        response: Object exposing ``status_code`` and ``json()``.

    Returns:
        A 5-tuple. Every element is ``None`` when the status code is not 200;
        otherwise each element is the matching JSON field, or ``None`` if the
        field is absent. A present observation is converted to a float32
        numpy array and also appended to the module-level ``obsers`` list.
    """
    if response.status_code != 200:
        print("Error code at response")
        return (None,) * 5

    payload = response.json()

    # Missing keys come back as None (dict.get default).
    done, info, observation, reward, trunc = (
        payload.get(field)
        for field in ('done', 'info', 'observation', 'reward', 'truncated')
    )

    if observation is not None:
        observation = np.array(observation, dtype=np.float32)
        obsers.append(observation)

    return done, info, observation, reward, trunc
    
# Retry policy: at most 5 attempts, exponential wait bounded to 4-10 seconds.
@retry(stop=stop_after_attempt(5),wait=wait_exponential(multiplier=1,min=4,max=10))
def make_request(url,headers,data=None,timeout=30):
    """POST to ``url`` and return the ``requests.Response``.

    Args:
        url: Full endpoint URL.
        headers: HTTP headers sent with the request.
        data: Optional JSON-serialisable body. A falsy value sends no body.
        timeout: Seconds to wait for the server before giving up on this
            attempt, so a stalled connection cannot block forever.

    Returns:
        The successful response object.

    Raises:
        Any exception raised in the body (HTTP error status, connection
        failure, timeout) triggers a retry; once the attempts are used up
        tenacity raises its own error (``RetryError`` by default).
    """
    # json=None is the requests default, so a falsy ``data`` behaves exactly
    # like not passing a body at all.
    response = requests.post(url,headers=headers,json=data if data else None,timeout=timeout)
    #raise http error for bad responses
    response.raise_for_status()

    # Short pause after every successful call (presumably to avoid hammering
    # the API - confirm before removing).
    time.sleep(0.1)

    # The log line is best-effort: an exception raised here would make the
    # retry decorator re-send a POST that already succeeded, so nothing below
    # is allowed to fail.
    try:
        payload = response.json()
    except ValueError:
        payload = {}
    if not isinstance(payload, dict):
        payload = {}
    url_parts = url.split('/')
    endpoint = url_parts[3] if len(url_parts) > 3 else url
    logging.info(f"Action:{endpoint} data {data}, obs: {payload.get('observation')},reward: {payload.get('reward')} ")
    return response



class TrainingCallback(BaseCallback):
    """Callback that logs the length and total reward of every finished episode.

    Only index 0 of the ``rewards`` and ``dones`` entries in ``self.locals``
    is read, i.e. the callback tracks the first environment only
    (assumes training runs on a single environment - TODO confirm if
    several are ever used).
    """

    def __init__(self, verbose=1):
        super().__init__(verbose)
        # Counters for the episode currently in progress
        self.episode_steps = 0
        self.episode_rewards = 0.0
        # History of all finished episodes; never cleared during training
        self.episode_lengths = []
        self.episode_rewards_list = []

    def _on_step(self) -> bool:
        """Update the running counters and log them when the episode ends.

        Returns:
            Always ``True``.
        """
        self.episode_steps += 1
        # Take the scalar for environment 0, matching the ``dones[0]`` check
        # below. Adding the whole array would turn the running total into an
        # ndarray.
        self.episode_rewards += float(self.locals['rewards'][0])

        if self.locals['dones'][0]:
            self.episode_lengths.append(self.episode_steps)
            self.episode_rewards_list.append(self.episode_rewards)

            logging.info(f"Episode finished - Steps: {self.episode_steps}, Reward: {self.episode_rewards}")

            # Reset for the next episode
            self.episode_steps = 0
            self.episode_rewards = 0.0

        return True

    def _on_rollout_end(self) -> None:
        """Log averages over every episode finished so far (cumulative, not per rollout)."""
        total_episodes = len(self.episode_lengths)
        if total_episodes > 0:
            avg_episode_length = sum(self.episode_lengths) / total_episodes
            avg_reward_per_episode = sum(self.episode_rewards_list) / total_episodes
            logging.info(f"End of rollout. Total episodes: {total_episodes}, "
                         f"Average episode length: {avg_episode_length}, "
                         f"Average reward per episode: {avg_reward_per_episode}")


class MazeAPIEnv(gym.Env):
    """Gymnasium environment whose reset and step calls are forwarded to an HTTP API.

    Actions are one of four discrete values. Observations are 2-element
    float32 vectors; the vector returned by the API is divided by 10 before it
    is handed to the agent.
    """

    def __init__(self, api_step_url, headers,api_reset):
        """
        Args:
            api_step_url: URL that receives ``{'action': <int>}`` on every step.
            headers: HTTP headers for every request. A falsy value falls back
                to a JSON ``Content-Type`` header.
            api_reset: URL that is POSTed to at the start of every episode.
        """
        super().__init__()
        self.api_step_url = api_step_url  # URL for the API step endpoint
        self.api_reset = api_reset
        # Use the caller's headers instead of silently replacing them.
        self.headers = headers if headers else {'Content-Type': 'application/json'}

        # Define the action and observation space
        self.action_space = spaces.Discrete(4)

        # For multi-dimensional observations, use Box space
        self.observation_space = spaces.Box(low=0, high=4, shape=(2,), dtype=np.float32)  # Observation space

        self.current_state = np.array([0.0, 0.0], dtype=np.float32)  # Starting position
        self.done = False

        # Raw positions seen in the current episode (bookkeeping only; the
        # reward does not depend on it)
        self.visited = set()
        # Bounds used by normalize_reward
        self.min_reward = -1
        self.max_reward = -0.01

    def normalize(self, observation):
        """Scale an observation by 1/10.

        If raw coordinates stay inside the declared 0-4 box, the result lies
        in [0, 0.4] (assumes the API respects that range - TODO confirm).
        """
        return (observation ) / 10

    def normalize_reward(self,reward):
        """Map ``reward`` linearly so ``min_reward`` -> -1 and ``max_reward`` -> 1."""
        return (2*(reward-self.min_reward)/(self.max_reward-self.min_reward))-1

    def reset(self,seed=None,**kwargs):
        """Reset the remote game and return ``(observation, info)``.

        The returned observation is always the fixed start ``[0.0, 0.0]``; the
        body of the API's reset response is not used.
        """
        # Let gymnasium seed ``self.np_random`` when a seed is supplied.
        super().reset(seed=seed)

        make_request(url=self.api_reset, headers=self.headers)

        self.current_state = np.array([0.0, 0.0], dtype=np.float32)
        self.done = False
        self.visited = set()

        return self.current_state, {}

    def step(self, action):
        """Send ``action`` to the API and return the gymnasium 5-tuple.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where the
            observation is scaled by ``normalize``, the reward is the API
            reward shifted by +1 and passed through ``normalize_reward``, and
            ``info`` is always an empty dict.

        Raises:
            RuntimeError: If the response carries no observation or reward.
        """
        # Send the action to the API
        content = {'action': int(action)}

        response = make_request(url=self.api_step_url, headers=self.headers, data=content)

        # Extract the response data
        done,info,raw_observation,reward,truncated = get_info(response)
        if raw_observation is None or reward is None:
            # Fail with a clear message instead of a TypeError on ``None + 1``.
            raise RuntimeError(
                f"Step response (status {response.status_code}) is missing "
                f"'observation' or 'reward'"
            )

        # NOTE(review): the reward is shifted by +1 before normalisation, so
        # the result is not confined to [-1, 1] - confirm this is intended.
        reward = float(self.normalize_reward(reward+1))

        # set.add is a no-op for members already present
        self.visited.add(tuple(raw_observation))

        # get_info already returns a float32 array; the division creates a new one
        self.current_state = self.normalize(raw_observation)

        # The API may omit these fields (-> None); gymnasium expects booleans.
        done = bool(done)
        truncated = bool(truncated)
        self.done = done

        return self.current_state, reward, done,truncated, {}
    


In [None]:
####################### PPO MODEL ########################
# Define the API endpoint and headers

# Read the base URL from env.json; the context manager closes the file handle
with open('env.json') as env_file:
    base_url = json.load(env_file)['url']

api_new_game = base_url+"/new_game"

headers = {'Content-Type': 'application/json'}

# Start new game
response = make_request(url = api_new_game, headers=headers)
uuid = response.json().get('uuid')
if not uuid:
    # Without this check the string concatenation below fails with an
    # unhelpful TypeError.
    raise RuntimeError("new_game response did not contain a 'uuid'")

api_reset = base_url+"/reset/"+uuid
api_step_url = base_url+"/step/"+uuid

# Instantiate the custom environment
env = MazeAPIEnv(api_step_url, headers,api_reset)


# Create the PPO model
# NOTE(review): tensorboard_log is an absolute path on one specific Windows
# machine; consider a path relative to the notebook so others can run this.
model = PPO('MlpPolicy', env,
            verbose=2,
            tensorboard_log="C:\\Users\\DIMITRIS\\Reinforcement_Learning\\.ppo_tensorboard",
            learning_rate=1e-3,
            n_steps=1024,
            ent_coef = 0.03
            )

# Train the model; the callback writes per-episode statistics to the log file
callback = TrainingCallback()

model.learn(total_timesteps=5000,progress_bar=True,callback=callback)  # Adjust the number of timesteps as needed


model.save('ppo_custom_env-test')

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy


# Example usage: evaluate the current `model` over 10 episodes
if __name__ == "__main__":
    # Read the base URL from env.json, the same source the training cell
    # uses, instead of a hard-coded server address.
    with open('env.json') as env_file:
        base_url = json.load(env_file)['url']

    api_new_game = base_url+"/new_game"

    headers = {'Content-Type': 'application/json'}

    # Start new game
    response = make_request(url = api_new_game, headers=headers)
    uuid = response.json().get('uuid')
    if not uuid:
        raise RuntimeError("new_game response did not contain a 'uuid'")

    api_reset = base_url+"/reset/"+uuid
    api_step_url = base_url+"/step/"+uuid

    # Instantiate the custom environment
    env = MazeAPIEnv(api_step_url, headers,api_reset)


    # `model` is whatever PPO instance is currently in the kernel (the
    # training cell creates one). To evaluate a saved model instead, load it:
    #model = PPO.load('ppo_custom_env.zip')  # Load your trained model

    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, deterministic=False)
    print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")


In [None]:
# Define the API endpoint and headers
# Read the base URL from env.json, the same source the training cell uses,
# instead of a hard-coded server address.
with open('env.json') as env_file:
    base_url = json.load(env_file)['url']

api_new_game = base_url+"/new_game"

headers = {'Content-Type': 'application/json'}

# Start new game
response = make_request(url = api_new_game, headers=headers)
uuid = response.json().get('uuid')
if not uuid:
    raise RuntimeError("new_game response did not contain a 'uuid'")

api_reset = base_url+"/reset/"+uuid
api_step_url = base_url+"/step/"+uuid

# Instantiate the custom environment
env = MazeAPIEnv(api_step_url, headers,api_reset)


# Replace with your environment and model paths
# NOTE(review): this file name differs from the one the training cell saves
# ('ppo_custom_env-test') - confirm it is the model you mean to replay.
model = PPO.load('ppo_custom_env5.zip')  # Load your trained model



total_rewards = []

obs, _ = env.reset()
done = False
truncated = False
episode_reward = 0

# Play one episode with the greedy policy. Stop as soon as the episode is
# terminated OR truncated; the previous condition `not done or truncated`
# kept stepping after truncation.
while not (done or truncated):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, truncated,info = env.step(action)
    episode_reward += reward

total_rewards.append(episode_reward)
print(f" Reward = {episode_reward}")

avg_reward = np.mean(total_rewards)

