<a href="https://colab.research.google.com/github/GarettGazay/ai_projects/blob/master/OracleRL_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [196]:
!pip install gym
!pip install stable_baselines3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [197]:
import pandas as pd
import numpy as np

In [198]:
data = pd.read_csv('/content/training_data.csv')

In [199]:
data.head(15)

Unnamed: 0,appt_hour,appt_minute,asset_id,dropoff_lat,dropoff_lng,load_order,multi_load_after,multi_load_before,pickup_hour,pickup_lat,pickup_lng,pickup_minute,rider_id,schedule_order,space_type
0,5,0,0,-121.848076,37.360374,1,0,0,4,-121.818245,37.319942,15,43,0,5
1,5,45,0,-121.93148,37.311272,0,0,0,5,-121.891289,37.314491,0,25,1,5
2,6,15,0,-121.933708,37.315559,0,0,0,5,-121.964226,37.292717,30,19,2,5
3,7,45,0,-122.20301,37.485722,0,0,0,6,-122.018349,37.35318,45,46,3,5
4,9,15,0,-122.093407,37.398567,0,0,0,8,-122.163383,37.477654,15,24,4,5
5,10,30,0,-121.891464,37.245495,0,0,0,9,-121.950905,37.334362,30,21,5,5
6,0,0,0,-121.964226,37.292717,0,0,0,9,-121.933708,37.315559,45,19,6,5
7,0,0,0,-121.961914,37.275314,0,0,0,10,-121.891701,37.245491,30,10,7,1
8,5,45,1,-122.079163,37.382942,0,0,0,4,-122.026657,37.321224,45,2,8,5
9,6,45,1,-121.891701,37.245491,0,0,0,5,-121.961914,37.275314,45,10,9,1


In [200]:
import gym
from stable_baselines3 import PPO

In [201]:
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np

class OracleEnv(Env):
    metadata = {'render.modes': ['human']}
    def __init__(self, 
        data,
        schedule_order,
        space_type_encoder,
        rider_id_encoder,
        asset_id_encoder,
        num_samples):

        self.data = data
        self.schedule_order = schedule_order
        self.space_type_encoder = space_type_encoder # to decode after prediction
        self.rider_id_encoder = rider_id_encoder # to decode after prediction
        self.asset_id_encoder = asset_id_encoder # to decode after prediction

        self.state = None
        self.selected_index = 0 # zero at start
        self.selected_ride = None
        self.num_features = 15
        self.num_samples = num_samples
        self.last_obs = None

        self.schedule_position_indexer = 0 # represents the ride-schedule positional relationship in time
        # that we want the agent to predict for, increments each time an action is taken, 
        # if the agent does not select the ride with the corresponding load order that matches this number,
        # it will end in a terminal state where the agent will get a negative reward. 


        self.action_space = Discrete(3)
        self.observation_space = Box(low=-np.inf, high=np.inf, shape=(self.num_features,))

    def step(self, action):
        info = {}

        # Apply action
        if action == 0 and self.selected_index > 0: # move up
            self.selected_index -= 1
            self.state = self.data[self.selected_index]

        elif action == 1 and self.selected_index <= self.num_samples - 1: # move down ( num_samples -1 to avoid the agent moving outside the bottom of the data list)
            self.selected_index += 1
            self.state = self.data[self.selected_index]

        elif action == 2: # select current ride
            self.state = self.data[self.selected_index] # select the state using agent's selected index
        else:
            self.selected_index = 0 # Agent selects the starting point state if it tries to move up at starting point which would end in a negative number which is invalid.
        
        # Calculate Reward
        if self.state[1] == self.schedule_position_indexer: # if the agent selected the correct ride for the SPI

            reward = 1 
            done = False # Good job agent, you may continue the game.
            # print(f'Agent selected the correct ride for the SPI - Agent selection index: {self.selected_index} == SPI: {self.schedule_position_indexer} ')
        else: 
            reward = -1 # if the agent selects the wrong index to SPI relationship.
            done = False # Terminal state for failing to order correctly - The agent must put every ride in the correct order or the schedeule will have not been made correctly and the hard and fast goal is to get the agent to learn how to arrange a schedule like the human did.
        
        # Check if ordering is done
        if self.schedule_position_indexer == self.num_samples: 
            done = True
            reward = 100 # if the agent gets to the end it means it has selected the correct schedule so it gets a fatty reward.
        
        # Return step information
        self.schedule_position_indexer += 1

        # Mask features to help the agent learn from the critical features.
        masked_observation = np.copy(self.state) # ensure that the original observation is not modified when masking the feature.
        masked_observation[13] = -1 # Mask the third feature of the observation by setting its value to -1
        print(masked_observation)
        return masked_observation, reward, done, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        # Reset shower temperature
        self.state = self.data[0] # index zero of the observation space
        self.selected_index = 0
        self.schedule_position_indexer = 0 # start iteration from the beginning
        return self.state

    def last_obs(self):
      return self.last_obs


In [202]:
 # Label encode 
from sklearn.preprocessing import LabelEncoder

space_type_encoder = LabelEncoder() 
space_type_encoder.fit_transform(data['space_type'])

rider_id_encoder = LabelEncoder()
rider_id_encoder.fit_transform(data['rider_id'])

asset_id_encoder = LabelEncoder()
asset_id_encoder.fit_transform(data['asset_id'])

schedule_order = [x for x in data['schedule_order']]
num_samples = len(data)

In [203]:
# Create the environment
env = OracleEnv(
    data.values,
    schedule_order,
    space_type_encoder,
    rider_id_encoder,
    asset_id_encoder,
    num_samples)

In [204]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: env])  # Create a vectorized environment

model = PPO('MlpPolicy', env, verbose=1)  # Create a PPO model

model.learn(total_timesteps=100000)  # Train the model for 10000 timesteps




Using cuda device
-----------------------------
| time/              |      |
|    fps             | 594  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 553          |
|    iterations           | 2            |
|    time_elapsed         | 7            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0015319467 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.1         |
|    explained_variance   | -0.00122     |
|    learning_rate        | 0.0003       |
|    loss                 | 189          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.000635    |
|    value_loss           | 774          |
-----------------------------------------

IndexError: ignored

In [205]:
# Assume you have a trained model called 'model' and a list of rides called 'rides'

# Create a function to preprocess each observation (vector of data that contains important samples about the ride)
def preprocess_observation(observation):
    # Preprocess the observation as necessary (e.g., normalize, scale, one-hot encode, etc.)
    preprocessed_observation = observation # placeholder
    return preprocessed_observation

# Create a list to hold the predicted actions for each observation
predicted_actions = []

# Iterate over each observation (vector of data that contains important samples about the ride)

for observation in data.values:
    # Preprocess the observation
    preprocessed_observation = preprocess_observation(observation)

    # Predict the action to take for this observation using the trained model
    action, _ = model.predict(preprocessed_observation, deterministic=True)

    # Add the predicted action to the list of predicted actions
    predicted_actions.append(action)

# Use the predicted actions to determine the order of the rides
ordered_rides = [data.values[i] for i in np.argsort(predicted_actions)]

ordered_rides = pd.DataFrame(ordered_rides, columns=['appt_hour', 'appt_minute', 'asset_id', 'dropoff_lat', 'dropoff_lng', 'load_order', 'multi_load_after', 'multi_load_before', 'pickup_hour', 'pickup_lat', 'pickup_lng', 'pickup_minute', 'rider_id', 'schedule_order', 'space_type'])
ordered_rides.head(15)


Unnamed: 0,appt_hour,appt_minute,asset_id,dropoff_lat,dropoff_lng,load_order,multi_load_after,multi_load_before,pickup_hour,pickup_lat,pickup_lng,pickup_minute,rider_id,schedule_order,space_type
0,5.0,0.0,0.0,-121.848076,37.360374,1.0,0.0,0.0,4.0,-121.818245,37.319942,15.0,43.0,0.0,5.0
1,0.0,0.0,7.0,-122.021889,37.36327,0.0,0.0,0.0,13.0,-122.014046,37.376621,15.0,18.0,69.0,4.0
2,0.0,0.0,7.0,-121.925591,37.312283,0.0,0.0,0.0,12.0,-121.931572,37.311279,30.0,9.0,68.0,5.0
3,12.0,30.0,7.0,-121.902969,37.378452,0.0,0.0,0.0,11.0,-121.967812,37.322025,30.0,29.0,67.0,4.0
4,10.0,15.0,7.0,-122.014046,37.376621,0.0,0.0,0.0,9.0,-122.021889,37.36327,30.0,18.0,66.0,4.0
5,10.0,0.0,7.0,-122.014107,37.376469,0.0,0.0,0.0,9.0,-122.022224,37.363358,30.0,33.0,65.0,5.0
6,9.0,0.0,7.0,-121.826912,37.251938,0.0,0.0,0.0,8.0,-121.811661,37.262196,15.0,14.0,64.0,0.0
7,6.0,15.0,7.0,-121.846095,37.308179,0.0,0.0,0.0,6.0,-121.832367,37.283016,0.0,45.0,63.0,5.0
8,6.0,0.0,7.0,-121.812096,37.306049,0.0,0.0,0.0,5.0,-121.849152,37.326744,30.0,7.0,62.0,5.0
9,6.0,0.0,7.0,-121.811386,37.305172,0.0,0.0,0.0,5.0,-121.835358,37.337029,15.0,26.0,61.0,5.0


In [None]:
# Evaluate the model and visualize its decisions
obs = env.reset()
for i in range(1000):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()
env.close()