In [2]:
import numpy as np 
from tqdm import tqdm 
from environment import Enviroment

# Value Table

In [3]:
value_function = np.load("./value_function.npy")

In [4]:
value_function

array([[470.00565273, 480.00565273, 490.00565273],
       [480.00565273, 490.00565273, 500.00565273],
       [480.00565273, 490.00565273, 500.00565273]])

# Actions

In [5]:
actions = np.load("./actions.npy") 

# Policy visualization

In [6]:
policy_history = np.load("./history.npy")

In [7]:
optimal_policy = policy_history[-1]

In [8]:
print(optimal_policy) 

[[[0 0]
  [0 0]
  [0 0]]

 [[0 0]
  [0 0]
  [0 0]]

 [[0 0]
  [0 0]
  [0 0]]]


# Examine the efficiency of each policy 

In [9]:
class Tester: 
    def __init__(self, cars1, cars2, cars_max, policy, actions): 
        self.cars1 = cars1
        self.cars2 = cars2 
        self.cars_max = cars_max
        self.policy = policy
        self.actions = actions

    def update_cars(self, new_car_1, new_car_2): 
        self.cars1 = new_car_1
        self.cars2 = new_car_2

    def get_cars(self): 
        return (self.cars1, self.cars2)

    def get_action_index(self, s): 
        action_index = self.policy[s]
        action_index = tuple(action_index)
        return action_index

    def get_action(self, action_index): 
        return self.actions[action_index] 

In [10]:
constants = np.load("./constants.npy")
number_of_days = 2000000

In [11]:
rewards = np.load("./rewards.npy")
rewards

array([10, -2])

In [12]:
actions

array([[ 0,  1,  2,  3],
       [ 0, -1, -2, -3]])

In [13]:
max_number_of_cars, expected_request_lambda_1, expected_request_lambda_2, expected_return_lambda_1, expected_return_lambda_2 = constants

In [14]:
environment = Enviroment(expected_request_lambda_1=expected_request_lambda_1, expected_request_lambda_2=expected_request_lambda_2,
                             expected_return_lambda_1=expected_return_lambda_1, expected_return_lambda_2=expected_return_lambda_2, agent=None)

In [15]:
cars1 = 2
cars2 = 1

tester_list = [] 

for policy in policy_history: 
    tester = Tester(cars1, cars2, max_number_of_cars, policy=policy, actions=actions)

    tester_list.append(tester) 

In [18]:
reward_list = []
for i, tester in enumerate(tester_list): 
    current_reward = 0
    for d in tqdm(range(number_of_days), desc=f"Simulating sales for tester {i}"): 
        rental_request_1, rental_request_2 = environment.get_rental_requests()
        customer_return_1, customer_return_2 = environment.get_customer_returns()

        # current number of cars 
        cars1, cars2 = tester.get_cars()

        # renting out the cars
        cars_rented_1 = min(cars1, rental_request_1)
        cars_rented_2 = min(cars2, rental_request_2)

        # calculate new car number for tomorrow
        cars1 -= cars_rented_1
        cars2 -= cars_rented_2

        cars1 += customer_return_1
        cars2 += customer_return_2

        # set cars to not exceed limit
        cars1 = min(cars1, tester.cars_max)
        cars2 = min(cars2, tester.cars_max)

        # calculate reward for cars rented
        current_reward += rewards[0] * cars_rented_1
        current_reward += rewards[0] * cars_rented_2

        # select the best action based on the current policy and current state
        action_index = tester.get_action_index((cars1 - 1, cars2 - 1))
        number_of_cars_moved = tester.get_action(action_index) 

        if number_of_cars_moved != 0: 
            print(f"number of cars moved: {number_of_cars_moved}")

        # cost of moving cars
        cost = rewards[1] * np.abs(number_of_cars_moved)
        current_reward += cost

        # update the cars to the number of cars moved
        cars1 += number_of_cars_moved
        cars2 -= number_of_cars_moved

        # set cars to not exceed the limit 
        cars1 = min(cars1, tester.cars_max)
        cars2 = min(cars2, tester.cars_max)
    
    reward_list.append(current_reward)

reward_list = np.array(reward_list) 
best_policy = np.argmax(reward_list) 
print(f"best policy from simulation: {best_policy} || best policy from agent: {len(policy_history) - 1}")

Simulating sales for tester 0: 100%|██████████| 10000000/10000000 [01:11<00:00, 138929.24it/s]
Simulating sales for tester 1: 100%|██████████| 10000000/10000000 [01:11<00:00, 139117.31it/s]
Simulating sales for tester 2: 100%|██████████| 10000000/10000000 [01:12<00:00, 137457.68it/s]
Simulating sales for tester 3: 100%|██████████| 10000000/10000000 [01:12<00:00, 138627.86it/s]

best policy from simulation: 2 || best policy from agent: 3



