In [3]:
import requests
import numpy as np
from gymnasium import spaces
import gymnasium as gym
from stable_baselines3 import PPO
from tenacity import retry, stop_after_attempt, wait_exponential
import logging
import time
from stable_baselines3.common.callbacks import BaseCallback
import os


In [4]:
# Chronological trace of the agent's moves: get_info() appends every
# observation it parses from an API response to this list.
obsers = []


# ---- Logging setup ---------------------------------------------------------
log_filename = "maze_agent_run.log"  # Log file name

# Cap urllib3 at INFO so its DEBUG records stay out of the log file even
# though the root logger below is configured at DEBUG.
urllib3_logger = logging.getLogger("urllib3")
urllib3_logger.setLevel(logging.INFO)

logging.basicConfig(
    level=logging.DEBUG,  # Root logging level
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",  # Format for log messages
    handlers=[
        # mode="w" starts every run with an empty log file. This replaces the
        # previous os.remove() + bare `except: pass`, which hid real errors
        # and could delete a file that an earlier handler still had open.
        logging.FileHandler(log_filename, mode="w"),
    ],
    # Without force=True, basicConfig() does nothing when the root logger
    # already has handlers (e.g. when this cell is re-run in the same kernel),
    # so the fresh file handler would never be installed. Needs Python 3.8+.
    force=True,
)

def get_info(response):
    """Unpack a step/reset API response into its five standard fields.

    Args:
        response: Object exposing ``status_code`` and ``json()`` (as returned
            by ``requests``).

    Returns:
        Tuple ``(done, info, observation, reward, trunc)``. Any field missing
        from the JSON body is ``None``. When the status code is not 200 the
        body is not read and all five values are ``None``. A present
        observation is converted to a ``float32`` NumPy array and also
        appended to the module-level ``obsers`` list.
    """
    if response.status_code == 200:
        payload = response.json()

        # Pull every expected key; absent keys yield None.
        field_names = ('done', 'info', 'observation', 'reward', 'trunc')
        done, info, observation, reward, trunc = (
            payload.get(name) for name in field_names
        )

        if observation is not None:
            observation = np.array(observation, dtype=np.float32)
            obsers.append(observation)  # keep the global move history up to date

        return done, info, observation, reward, trunc

    print("Error code at response")
    return (None,) * 5
    
# Retry policy: up to 5 attempts, exponential back-off bounded to 4-10 seconds.
@retry(stop=stop_after_attempt(5),wait=wait_exponential(multiplier=1,min=4,max=10))
def make_request(url, headers, data=None, timeout=30):
    """POST to the maze API and return the response, retrying on failure.

    Args:
        url: Full endpoint URL.
        headers: HTTP headers to send with the request.
        data: Optional JSON-serialisable payload. A falsy value (``None``,
            ``{}``) sends the request without a body.
        timeout: Seconds to wait for the server before the attempt is
            abandoned (and retried). Pass ``None`` to wait indefinitely.

    Returns:
        The ``requests.Response`` of the first successful attempt.

    Raises:
        tenacity.RetryError: When every attempt failed (connection error,
            timeout, or a 4xx/5xx status).
    """
    # json=None is the same as omitting the argument, so one call covers both
    # the "with payload" and "no payload" cases. The timeout stops a stalled
    # server from blocking training forever.
    response = requests.post(url, headers=headers, json=data or None, timeout=timeout)
    # Raise HTTPError for 4xx/5xx so the retry decorator takes over.
    response.raise_for_status()

    time.sleep(0.1)

    # Nothing below may raise: this code runs inside the retry scope, so an
    # exception while merely formatting the log line would re-send a request
    # the server has already processed (e.g. apply the same step twice).
    try:
        payload = response.json()
    except ValueError:
        payload = {}
    if not isinstance(payload, dict):
        payload = {}
    segments = url.split('/')
    action = segments[3] if len(segments) > 3 else url
    logging.info(f"Action:{action} data {data}, obs: {payload.get('observation')},reward: {payload.get('reward')} ")
    return response


class TrainingCallback(BaseCallback):
    """Stable-Baselines3 callback that writes reward information to the log.

    Rewards are read from ``self.locals['rewards']`` and logged on every tenth
    callback invocation and once more whenever a rollout finishes.
    """

    def __init__(self, verbose=1):
        super().__init__(verbose)

    def _on_step(self) -> bool:
        """Log the current rewards every 10th call; never stops training."""
        if self.n_calls % 10 != 0:
            return True

        logging.info(f"Step: {self.n_calls}, Reward: {self.locals['rewards']}")
        return True

    def _on_rollout_end(self) -> None:
        """Log the most recent rewards once a rollout has been collected."""
        logging.info(f"Last rewards: {self.locals['rewards']}")
    



class MazeAPIEnv(gym.Env):
    """Gymnasium environment whose transitions are served by a remote maze API.

    Every ``reset`` and ``step`` is an HTTP POST issued through
    ``make_request``; step responses are decoded with ``get_info``.

    Actions are ``Discrete(4)``. Observations are two ``float32`` values
    declared as ``Box(0, 4)``; the values actually returned by ``step`` are
    the API observation passed through ``normalize``.
    """

    # Divisor applied by normalize().
    # NOTE(review): the declared bounds are [0, 4], so in-bounds observations
    # end up in [0, 0.4] rather than [0, 1], and obs_low / obs_range below are
    # never used. Confirm the real maze size before switching to min-max
    # scaling, since that would change what the policy is trained on.
    _OBS_SCALE = 10

    def __init__(self, api_step_url, headers, api_reset):
        """
        Args:
            api_step_url: URL of the API step endpoint.
            headers: HTTP headers sent with every request. When empty or
                ``None`` a JSON ``Content-Type`` header is used instead.
            api_reset: URL of the API reset endpoint.
        """
        super().__init__()
        self.api_step_url = api_step_url
        # Previously the argument was overwritten with a hard-coded dict, so
        # caller-supplied headers (e.g. authorization) were silently dropped.
        self.headers = dict(headers) if headers else {'Content-Type': 'application/json'}
        self.api_reset = api_reset

        # Action and observation spaces.
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(low=0, high=4, shape=(2,), dtype=np.float32)

        # Bounds of the declared observation space (kept as public attributes).
        self.obs_low = self.observation_space.low
        self.obs_high = self.observation_space.high
        self.obs_range = self.obs_high - self.obs_low

        self.current_state = np.array([0.0, 0.0], dtype=np.float32)  # Starting position
        self.done = False

    def normalize(self, observation):
        """Scale an observation down by the fixed factor ``_OBS_SCALE`` (10)."""
        return observation / self._OBS_SCALE

    def reset(self, seed=None, **kwargs):
        """Reset the remote game and return ``(observation, info)``.

        The body of the reset response is not inspected; the episode is
        assumed to start at ``[0, 0]``.
        """
        # Gymnasium expects subclasses to forward the seed so that
        # self.np_random is seeded consistently.
        super().reset(seed=seed)

        make_request(url=self.api_reset, headers=self.headers)

        self.current_state = np.array([0.0, 0.0], dtype=np.float32)
        self.done = False

        return self.current_state, {}

    def step(self, action):
        """Send ``action`` to the API and return the Gymnasium 5-tuple.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``observation`` is the normalized API observation and ``reward``
            is the API reward plus one.

        Raises:
            RuntimeError: If the response carries no observation or reward.
        """
        content = {'action': int(action)}
        response = make_request(url=self.api_step_url, headers=self.headers, data=content)

        done, info, raw_observation, reward, truncated = get_info(response)
        if raw_observation is None or reward is None:
            # Fail with a clear message instead of a TypeError on `None + 1`.
            raise RuntimeError(
                f"Step response (HTTP {response.status_code}) is missing "
                f"'observation' or 'reward'"
            )

        # Constant +1 offset kept from the original implementation
        # (presumably reward shaping; confirm with the author).
        reward = float(reward) + 1
        self.current_state = self.normalize(np.asarray(raw_observation, dtype=np.float32))

        # Missing flags come back as None from get_info; treat them as False.
        self.done = bool(done)
        truncated = bool(truncated)

        # Gymnasium (and Stable-Baselines3 on top of it) unpacks five values
        # from step(); the former 4-tuple fails with "not enough values to
        # unpack".
        return self.current_state, reward, self.done, truncated, info if isinstance(info, dict) else {}
    


In [None]:
####################### PPO MODEL ########################
# Base URL of the maze server. Defined once so the three endpoints cannot
# drift apart; override with the MAZE_API_BASE_URL environment variable.
API_BASE_URL = os.environ.get("MAZE_API_BASE_URL", "http://18.185.60.20:5005").rstrip("/")
TOTAL_TIMESTEPS = 10000  # Adjust the number of timesteps as needed

api_new_game = f"{API_BASE_URL}/new_game"

headers = {'Content-Type': 'application/json'}

# Start a new game; the server identifies it by the uuid it returns.
response = make_request(url=api_new_game, headers=headers)
uuid = response.json().get('uuid')
if not uuid:
    # Without this check a missing uuid surfaces as an opaque TypeError when
    # the endpoint URLs are built below.
    raise RuntimeError(f"{api_new_game} did not return a game uuid")

api_reset = f"{API_BASE_URL}/reset/{uuid}"
api_step_url = f"{API_BASE_URL}/step/{uuid}"

# Instantiate the custom environment
env = MazeAPIEnv(api_step_url, headers, api_reset)

# Create the PPO model
model = PPO('MlpPolicy', env, verbose=1)

# Train the model. TrainingCallback was defined but never attached before, so
# none of its reward logging reached the log file.
model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=TrainingCallback())

# Save the model
model.save('ppo_custom_env')