In [1]:
import numpy as np 
from tqdm import tqdm 
# from environment import Enviroment

# Distribution

The standard deviation of a Poisson distribution with rate $\lambda$ is $\sqrt{\lambda}$. 

When calculating the theoretical average of the distribution for policy evaluation, a practical range of values to consider extends up to **3 to 4 standard deviations above the mean**. 

Therefore, the range one should consider can be expressed as $[0, \lambda + 3 \times \sqrt{\lambda}]$. Of course, the distribution can take values larger than this range, but their probabilities are so small that they are inconsequential to the theoretical average computed in the policy evaluation step. 

# Environment

In [29]:
class Agent: 
    """Policy-iteration agent for a two-location car rental problem.

    A state ``(i, j)`` is the number of cars parked at location 1 and
    location 2; it is also the index into ``state_policy_values`` and
    ``policy``.  The tables passed as ``states`` / ``starting_policy`` should
    therefore have one row and one column per possible car count
    (``cars_max + 1`` each).

    Parameters
    ----------
    cars_max : maximum number of cars a location can hold.
    actions : array of signed car moves.  A positive entry adds that many
        cars to location 1 and removes them from location 2; a negative
        entry moves cars the other way.
    starting_policy : array shaped like ``states``; each entry is the *flat*
        index (C order) of the chosen entry of ``actions``.
    states : array whose shape defines the state space.
    rewards : ``rewards[0]`` is the reward per rented car and ``rewards[1]``
        the reward per moved car (normally negative, i.e. a cost).
    theta : convergence threshold for policy evaluation.
    gamma : discount factor.
    """

    def __init__(self, cars_max: int, actions: list, starting_policy: list, states: list, rewards: list, theta: float, gamma: float): 
        # numbers of cars
        self.cars_max = cars_max

        # variables for reinforcement learning

        # contains the actual value of increasing / decreasing the cars of the 2 locations 
        self.actions = np.asarray(actions)

        # each state represents the number of cars in both locations
        self.states = np.asarray(states)

        # policy contains, per state, a flat index into the action array.
        # A flat integer is used because a 2-D action index (row, column)
        # cannot be stored in a single cell of the policy table.
        self.policy = np.array(starting_policy, dtype=np.intp)

        # type of rewards available
        self.rewards = rewards

        self.theta = theta 
        self.gamma = gamma

        # Always a float table, even if `states` happens to be an integer array;
        # otherwise discounted values would be truncated on assignment.
        self.state_policy_values = np.zeros(self.states.shape, dtype=np.float64)

    def _lot_capacities(self) -> tuple:
        """Largest car count each location may hold that is still a valid table index."""
        rows, cols = self.state_policy_values.shape
        return min(self.cars_max, rows - 1), min(self.cars_max, cols - 1)

    def get_reward(self, reward_index, number_of_cars: int): 
        """Return ``rewards[reward_index]`` scaled by ``number_of_cars``."""
        return self.rewards[reward_index] * number_of_cars

    def p(self, s, action_index: tuple, today_rental_request_1: int, today_rental_request_2: int, today_customer_return_1, today_customer_return_2) -> tuple:
        """Simulate one day from state ``s`` after taking the action at ``action_index``.

        The day proceeds as: move cars, serve rental requests from the cars on
        hand, then add the returned cars (returns only become available the
        next day, so they never serve today's requests).

        Parameters
        ----------
        s : ``(cars at location 1, cars at location 2)``.
        action_index : index into ``self.actions``.
        today_rental_request_1, today_rental_request_2 : rental demand per location.
        today_customer_return_1, today_customer_return_2 : cars returned per location.

        Returns
        -------
        ``(reward, next_state)``, or ``None`` when the move is infeasible
        because it would leave a location with a negative number of cars.
        """
        cars_moved = int(self.actions[action_index])
        capacity_1, capacity_2 = self._lot_capacities()

        # A positive action adds cars to location 1 and takes them from location 2.
        cars1 = int(s[0]) + cars_moved
        cars2 = int(s[1]) - cars_moved

        if cars1 < 0 or cars2 < 0:
            return None

        # Cars that do not fit into the destination lot are dropped.
        cars1 = min(cars1, capacity_1)
        cars2 = min(cars2, capacity_2)

        # if request > cars -> rent all cars. else, rent 'request' numbers of cars
        rented_1 = min(int(today_rental_request_1), cars1)
        rented_2 = min(int(today_rental_request_2), cars2)

        final_reward = float(
            self.get_reward(0, rented_1)
            + self.get_reward(0, rented_2)
            + self.get_reward(1, number_of_cars=abs(cars_moved))
        )

        # Rented cars leave the lot; returned cars arrive; lots are capped at capacity.
        cars1 = min(cars1 - rented_1 + int(today_customer_return_1), capacity_1)
        cars2 = min(cars2 - rented_2 + int(today_customer_return_2), capacity_2)

        return final_reward, (cars1, cars2)

    def get_action_from_policy(self, s: tuple) -> tuple: 
        """
        -> return the index into the action array chosen by the policy for state ``s``
        """
        unraveled = np.unravel_index(int(self.policy[s]), self.actions.shape)
        return tuple(int(k) for k in unraveled)

    def set_action_to_policy(self, action_index: tuple, s: tuple) -> None: 
        """
        Store ``action_index`` (an index into the action array, or an already
        flat integer index) as the policy's action for state ``s``.
        """
        if np.ndim(action_index) == 0:
            flat_index = int(action_index)
        else:
            flat_index = int(np.ravel_multi_index(tuple(action_index), self.actions.shape))

        self.policy[s] = flat_index

    def check_valid_value_function_delta(self, delta) -> bool: 
        """Return True when ``delta`` is below the convergence threshold ``theta``."""
        return bool(delta < self.theta)

    def policy_evaluation_step(self, today_rental_request_1, today_rental_request_2, today_customer_return_1, today_customer_return_2) -> float: 
        """Run one in-place evaluation sweep over all states for the current policy.

        Each state's value is replaced by ``reward + gamma * V[next_state]`` for
        the single sampled day described by the arguments.

        Returns the largest absolute change made to any state value.
        """
        delta = 0.0

        for s in np.ndindex(self.state_policy_values.shape):
            outcome = self.p(s, action_index=self.get_action_from_policy(s), today_rental_request_1=today_rental_request_1, today_rental_request_2=today_rental_request_2, today_customer_return_1=today_customer_return_1, today_customer_return_2=today_customer_return_2)

            # The policy's action is infeasible in this state: nothing to back up.
            if outcome is None:
                continue

            reward, next_state = outcome

            previous_value = self.state_policy_values[s]
            updated_value = reward + self.gamma * self.state_policy_values[next_state]
            self.state_policy_values[s] = updated_value

            delta = max(delta, float(abs(previous_value - updated_value)))

        return delta 

    def policy_improvement_step(self, today_rental_request_1, today_rental_request_2, today_customer_return_1, today_customer_return_2) -> bool: 
        """
        Make the policy greedy with respect to the current value function.

        -> returns True if the policy is stable (no state changed its action),
           returns False if at least one state's action was replaced
        """
        policy_stable = True

        for s in np.ndindex(self.state_policy_values.shape):
            current_action = self.get_action_from_policy(s)
            current_value = None

            # Start below every attainable value so negative-valued actions can still win.
            best_action = None
            best_value = -np.inf

            for candidate in np.ndindex(self.actions.shape):
                outcome = self.p(s=s, action_index=candidate, today_rental_request_1=today_rental_request_1, today_rental_request_2=today_rental_request_2, today_customer_return_1=today_customer_return_1, today_customer_return_2=today_customer_return_2)

                if outcome is None:
                    continue

                reward, next_state = outcome
                candidate_value = reward + self.gamma * self.state_policy_values[next_state]

                if candidate == current_action:
                    current_value = candidate_value

                if candidate_value > best_value:
                    best_value = candidate_value
                    best_action = candidate

            # Keep the current action when nothing is feasible or when it ties
            # with the best one; switching between equally good actions would
            # prevent the policy from ever being reported as stable.
            if best_action is None:
                continue
            if current_value is not None and current_value >= best_value:
                continue

            self.set_action_to_policy(best_action, s)
            policy_stable = False

        return policy_stable

In [28]:
class Enviroment:
    """Car-rental environment: samples daily demand/returns and drives policy iteration.

    Rental requests and customer returns at each of the two locations are
    drawn from Poisson distributions with the given expected values.

    Parameters
    ----------
    expected_request_lambda_1, expected_request_lambda_2 : Poisson means of
        the daily rental requests at location 1 and 2.
    expected_return_lambda_1, expected_return_lambda_2 : Poisson means of the
        daily customer returns at location 1 and 2.
    agent : object providing ``policy``, ``policy_evaluation_step``,
        ``check_valid_value_function_delta`` and ``policy_improvement_step``.
        Only needed by ``train``; the samplers work without it.
    rng : optional random source exposing ``poisson(lam=...)`` (for example
        ``np.random.default_rng(seed)``) for reproducible runs.  Defaults to
        NumPy's global ``np.random`` state.
    """

    def __init__(self, expected_request_lambda_1: float, expected_request_lambda_2: float, expected_return_lambda_1: float, expected_return_lambda_2: float, agent: "Agent", rng=None):
        # initialize Poisson random variables
        self.expected_request_lambda_1 = expected_request_lambda_1
        self.expected_request_lambda_2 = expected_request_lambda_2

        self.expected_return_lambda_1 = expected_return_lambda_1
        self.expected_return_lambda_2 = expected_return_lambda_2

        # initialize agent
        self.agent = agent

        # random source used by the samplers
        self.rng = np.random if rng is None else rng

        # initialize policy history 
        self.policy_history = []

    def get_rental_requests(self) -> tuple:
        """
        -> returns a tuple of the number of rental requests at each location
        """
        rental_request_1 = self.rng.poisson(
            lam=self.expected_request_lambda_1)
        rental_request_2 = self.rng.poisson(
            lam=self.expected_request_lambda_2)

        return (rental_request_1, rental_request_2)

    def get_customer_returns(self) -> tuple:
        """
        -> returns a tuple of the number of cars returning at each location
        """
        customer_return_1 = self.rng.poisson(
            lam=self.expected_return_lambda_1)
        customer_return_2 = self.rng.poisson(
            lam=self.expected_return_lambda_2)

        return (customer_return_1, customer_return_2)

    def train(self, number_of_days, max_iterations) -> int: 
        """
        Alternate policy evaluation and policy improvement.

        Each iteration records a snapshot of the agent's policy in
        ``policy_history``, evaluates it for at most ``number_of_days`` sampled
        days (stopping early once the agent reports convergence), then runs
        one improvement step on a separately sampled "benchmark" day.

        -> returns the zero-based index of the iteration in which the
           improvement step reported a stable policy, or ``max_iterations``
           if that never happened
        """
        for iteration in tqdm(range(max_iterations), desc="Policy Check", total=max_iterations): 
            # sample the benchmark day used by the improvement step
            benchmark_rental_request_1, benchmark_rental_request_2 = self.get_rental_requests()
            benchmark_customer_return_1, benchmark_customer_return_2 = self.get_customer_returns() 

            # Store a copy: the agent updates its policy in place, so appending
            # the array itself would make every history entry show the latest policy.
            self.policy_history.append(np.copy(self.agent.policy))

            # inner loop to evaluate value function
            for _ in tqdm(range(number_of_days), desc="Policy Evaluation", total=number_of_days, leave=False): 
                today_rental_request_1, today_rental_request_2 = self.get_rental_requests()
                today_customer_return_1, today_customer_return_2 = self.get_customer_returns()

                delta = self.agent.policy_evaluation_step(today_rental_request_1=today_rental_request_1, today_rental_request_2=today_rental_request_2, today_customer_return_1=today_customer_return_1, today_customer_return_2=today_customer_return_2)

                if self.agent.check_valid_value_function_delta(delta=delta): 
                    break 

            # check if current policy is optimal policy
            has_optimal_policy = self.agent.policy_improvement_step(today_rental_request_1=benchmark_rental_request_1, today_rental_request_2=benchmark_rental_request_2, today_customer_return_1=benchmark_customer_return_1, today_customer_return_2=benchmark_customer_return_2)

            if has_optimal_policy: 
                return iteration

        return max_iterations

In [2]:
# Expected (Poisson mean) number of daily rental requests per location
expected_request_lambda_1 = 3
expected_request_lambda_2 = 4

# Expected (Poisson mean) number of daily customer returns per location
expected_return_lambda_1 = 3
expected_return_lambda_2 = 2

# -> at location 1, as many cars are expected to be returned as are requested
# -> at location 2, only half as many cars are expected to be returned as are requested

# `Enviroment` requires an `agent` argument.  The agent is only created further
# down in the notebook, so start without one (sampling requests/returns does not
# need it) and set `environment.agent = agent` before calling `environment.train`.
environment = Enviroment(
    expected_request_lambda_1=expected_request_lambda_1,
    expected_request_lambda_2=expected_request_lambda_2,
    expected_return_lambda_1=expected_return_lambda_1,
    expected_return_lambda_2=expected_return_lambda_2,
    agent=None,
)

In [4]:
# Number of days iterated over by the (commented-out) sampling demo in the next cell.
days = 20 

In [17]:
########## uncomment and run this cell to see samples of how the environment generates rental requests and customer returns 

# for i in range(days): 
#     request1, request2 = environment.get_rental_requests()

#     return1, return2 = environment.get_customer_returns()

#     print(f"Location 1 actually has {request1} rental requests and receives {return1} cars that the customers returns || {return1 - request1} cars gained")
#     print(f"Location 2 actually has {request2} rental requests and receives {return2} cars that the customers returns || {return2 - request2} cars gained")

#     print(f"------------------------------------")

# Agent

In [18]:
# Upper bound on the number of cars per location (passed to Agent as `cars_max`)
# and the largest number of cars a single action may move.
max_number_of_cars = 3

In [22]:
# Action table: row 0 holds the moves 0..max_number_of_cars, row 1 the same moves negated.
available_actions = np.stack([np.arange(max_number_of_cars + 1), -np.arange(max_number_of_cars + 1)])

In [24]:
# Display the action table (row 0: non-negative moves, row 1: non-positive moves).
available_actions

array([[ 0,  1,  2,  3],
       [ 0, -1, -2, -3]])

In [27]:
# A location can hold 0..max_number_of_cars cars, so the state-indexed tables
# need max_number_of_cars + 1 rows and columns (a 3x3 table cannot represent 3 cars).
state_table_shape = (max_number_of_cars + 1, max_number_of_cars + 1)

# Agent.p reads rewards[0] for every rented car and rewards[1] for every moved
# car: +10 per rental and -2 per car moved.
agent = Agent(
    cars_max=max_number_of_cars,
    actions=available_actions,
    starting_policy=np.zeros(state_table_shape, dtype=int),
    states=np.zeros(state_table_shape),
    rewards=[10, -2],
    theta=1e-3,
    gamma=0.9,
)

# Combining Environment & Agent