In [None]:
import numpy as np

# --- Problem parameters ------------------------------------------------------
RENTAL_REVENUE = 10   # income credited per rented bike (see get_reward)
MOVE_COST = 2         # charge per bike moved between the two locations
PARKING_COST = 4      # charge per location holding more than 10 bikes
MAX_BIKES = 20        # bike counts at each location are clipped to 0..MAX_BIKES
MAX_MOVE = 5          # actions considered range over -MAX_MOVE..MAX_MOVE
DISCOUNT = 0.9        # weight applied to the successor state's value

# One state per (bikes at location 1, bikes at location 2) pair.
num_states = (MAX_BIKES + 1) ** 2

# Value table and policy table, both indexed as [loc1, loc2] and zero-filled.
# The policy table is floating point, so readers cast its entries with int().
V = np.zeros((MAX_BIKES + 1,) * 2)
policy = np.zeros_like(V)

def get_reward(state, action):
    """Return the net one-step reward for ``action`` taken in ``state``.

    Args:
        state: Pair ``(loc1, loc2)`` of bike counts at the two locations.
        action: Signed number of bikes moved; only its magnitude is charged.

    Returns:
        Rental income minus moving cost minus parking cost.
    """
    first, second = state

    income = 0
    parking = 0
    # Rentals are capped at 3 bikes for the first location and 4 for the
    # second; a location holding more than 10 bikes pays the parking fee.
    for bikes, rental_cap in ((first, 3), (second, 4)):
        income += min(bikes, rental_cap) * RENTAL_REVENUE
        if bikes > 10:
            parking += PARKING_COST

    moving = abs(action) * MOVE_COST
    return income - moving - parking

def policy_evaluation(policy, theta=1e-6, max_sweeps=10000):
    """Evaluate ``policy`` by updating the module-level value table ``V`` in place.

    Each sweep visits every state ``(loc1, loc2)`` and overwrites
    ``V[loc1, loc2]`` with the immediate reward of the policy's action plus
    ``DISCOUNT`` times the value of the successor state. Sweeps repeat until
    the largest change made during a sweep falls below ``theta``, so that
    ``V`` approximates the value of ``policy`` rather than a single backup.

    Args:
        policy: 2-D table indexed ``[loc1, loc2]``; each entry is the action
            for that state and is truncated with ``int()`` before use.
        theta: Convergence tolerance on the largest per-state change in a
            sweep.
        max_sweeps: Upper bound on the number of sweeps, so the loop still
            terminates if the updates do not contract (e.g. DISCOUNT >= 1).
            ``max_sweeps=1`` performs exactly one sweep.

    Returns:
        None. ``V`` is modified in place.
    """
    for _ in range(max_sweeps):
        delta = 0.0
        for loc1 in range(MAX_BIKES + 1):
            for loc2 in range(MAX_BIKES + 1):
                action = int(policy[loc1, loc2])
                # A positive action takes bikes from location 1 and adds
                # them to location 2. The +3 / +2 terms are fixed increments
                # in the transition (presumably returned bikes -- confirm);
                # counts are clipped to the valid range 0..MAX_BIKES.
                next_loc1 = min(MAX_BIKES, max(0, loc1 - action + 3))
                next_loc2 = min(MAX_BIKES, max(0, loc2 + action + 2))
                updated = (get_reward((loc1, loc2), action)
                           + DISCOUNT * V[next_loc1, next_loc2])
                delta = max(delta, abs(updated - V[loc1, loc2]))
                # Written back immediately, so later states in the same
                # sweep already see the refreshed value.
                V[loc1, loc2] = updated
        if delta < theta:
            break

def policy_improvement():
    """Make the module-level ``policy`` greedy with respect to ``V``.

    For every state, each action in ``-MAX_MOVE..MAX_MOVE`` that keeps both
    bike counts within ``0..MAX_BIKES`` after the move is scored as the
    immediate reward plus ``DISCOUNT`` times the successor state's value.
    The highest-scoring action is written into ``policy``; on ties the
    first (lowest) action tried is kept.

    Returns:
        True if no entry of ``policy`` changed, False otherwise.
    """
    size = MAX_BIKES + 1
    changed = False

    for loc1 in range(size):
        for loc2 in range(size):
            chosen = 0
            chosen_value = float('-inf')

            for action in range(-MAX_MOVE, MAX_MOVE + 1):
                after1 = loc1 - action
                after2 = loc2 + action
                # Skip moves that would leave either location with a
                # negative count or more than MAX_BIKES bikes.
                if not (0 <= after1 <= MAX_BIKES and 0 <= after2 <= MAX_BIKES):
                    continue

                succ1 = min(MAX_BIKES, max(0, after1 + 3))
                succ2 = min(MAX_BIKES, max(0, after2 + 2))
                candidate = get_reward((loc1, loc2), action) + DISCOUNT * V[succ1, succ2]

                # Strict comparison: an equally good later action does not
                # displace the one already chosen.
                if candidate > chosen_value:
                    chosen, chosen_value = action, candidate

            if policy[loc1, loc2] != chosen:
                changed = True
            policy[loc1, loc2] = chosen

    return not changed

def policy_iteration():
    """Alternate evaluation and improvement until the policy stops changing.

    Calls ``policy_evaluation`` on the module-level ``policy`` and then
    ``policy_improvement``, repeating until the latter reports that no
    entry changed.

    Returns:
        The tuple ``(policy, V)`` of the module-level policy and value tables.
    """
    stable = False
    while not stable:
        policy_evaluation(policy)
        stable = policy_improvement()
    return policy, V

# Run the solver, then print each result table under its heading.
optimal_policy, optimal_value = policy_iteration()

print("Optimal Policy (Bike Movement Between Locations):", optimal_policy, sep="\n")
print("Optimal Value Function:", optimal_value, sep="\n")


Optimal Policy (Bike Movement Between Locations):
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.]
 [ 1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.]
 [ 2.  1.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.  0.  0.
   1.  1.  0.]
 [ 2.  1.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.]
 [ 2.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.]
 [ 2.  1.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.  0.  0.
   1.  1.  0.]
 [ 2.  1.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.]
 [ 2.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.]
 [ 2.  1.  1.  1.  0.  1.  0.  1.  0. -1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  0.]
 [ 2.  2.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.]
 [ 3.  1.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0