# Policy Iteration

In [None]:
# Set of all states of the campus MDP.
S = [
    'canteen',
    'hostel',
    'academic building',
]

# Set of all actions available in every state.
A = [
    'attend classes',
    'hungry',
]

# Reward function: the reward collected for being in each state.
R = {
    'canteen': 1,
    'hostel': -1,
    'academic building': 3,
}

# Probability transition function, indexed as P[state][action][next_state].
# Each innermost mapping lists the probability of landing in every state.
P = {
    'hostel': {
        'attend classes': {'canteen': 0, 'hostel': 0.5, 'academic building': 0.5},
        'hungry': {'canteen': 1, 'hostel': 0, 'academic building': 0},
    },
    'academic building': {
        'attend classes': {'canteen': 0.3, 'hostel': 0, 'academic building': 0.7},
        'hungry': {'canteen': 0.8, 'hostel': 0, 'academic building': 0.2},
    },
    'canteen': {
        'attend classes': {'canteen': 0.1, 'hostel': 0.3, 'academic building': 0.6},
        'hungry': {'canteen': 1, 'hostel': 0, 'academic building': 0},
    },
}

In [None]:
def policy_iteration(S, A, R, P):
  """Alternate policy evaluation and greedy improvement until the policy is stable.

  Args:
    S: iterable of states; used here to build the starting policy.
    A: sequence of actions; the starting policy picks ``A[1]`` in every state.
    R: reward function. Not read directly by this function.
    P: transition function. Not read directly by this function.

  Returns:
    A ``(policy, V)`` tuple: the first policy that improvement leaves
    unchanged, and the value estimates that were computed for it.

  Note:
    ``policy_elvaluation`` and ``policy_improvement`` are called with only
    the policy / value table, so the model they use is not the one passed in
    through ``R`` and ``P``.
  """
  # Start by taking the second listed action everywhere.
  current = {state: A[1] for state in S}

  while True:
    V = policy_elvaluation(current)
    improved = policy_improvement(V)

    if improved == current:
      # Greedy improvement changed nothing, so the policy is stable.
      return improved, V

    current = improved

In [None]:
def policy_elvaluation(policy, gamma = 0.9, delta = 0.1):
  """Estimate the value of every state when following ``policy``.

  Repeats full sweeps over the module-level state list ``S``. In each sweep
  every state's value is recomputed from the previous sweep's values as
  ``R[s] + gamma * sum(P[s][policy[s]][s_next] * previous[s_next])``, using
  the module-level ``R`` and ``P``.

  Args:
    policy: mapping from each state in ``S`` to the action taken there.
    gamma: discount factor applied to the expected next-state value.
    delta: the sweeps stop once the largest absolute per-state change in a
      sweep is below this threshold.

  Returns:
    A dict mapping each state to its estimated value.
  """
  values = {s: 0 for s in S}
  converged = False

  while not converged:
    previous = values

    # Build the whole new table from the previous one (synchronous backup).
    values = {
      s: R[s] + gamma*sum(P[s][policy[s]][s_next]*previous[s_next] for s_next in S)
      for s in S
    }

    largest_change = max(abs(values[s] - previous[s]) for s in S)
    converged = largest_change < delta

  return values

In [None]:
def policy_improvement(V, gamma = 0.9):
  """Return the policy that is greedy with respect to the value table ``V``.

  For every state in the module-level ``S`` and every action in the
  module-level ``A``, the action value
  ``R[s] + gamma * sum(P[s][a][s_next] * V[s_next])`` is computed, and the
  action with the largest value is chosen. When several actions tie, the one
  listed first in ``A`` wins.

  Args:
    V: mapping from each state to its current value estimate.
    gamma: discount factor applied to the expected next-state value.

  Returns:
    A dict mapping each state to its greedy action.
  """
  greedy = {}

  for s in S:
    best_action, best_value = A[0], None

    for a in A:
      q = R[s] + gamma*sum(P[s][a][s_next]*V[s_next] for s_next in S)
      # Strict comparison keeps the earliest action among equal values.
      if best_value is None or q > best_value:
        best_action, best_value = a, q

    greedy[s] = best_action

  return greedy

In [None]:
# Solve the MDP with policy iteration, then report the policy and its values.
optimal_policy, optimal_value_function = policy_iteration(S, A, R, P)
for label, result in (('Optimal Policy', optimal_policy),
                      ('Optimal Value Function', optimal_value_function)):
  print(label, result)

Optimal Policy {'canteen': 'attend classes', 'hostel': 'attend classes', 'academic building': 'attend classes'}
Optimal Value Function {'canteen': 17.96045319629523, 'hostel': 15.189986372635445, 'academic building': 20.980262034084397}


# Value Iteration

In [None]:
def value_iteration(S, A, R, P, gamma=0.9, delta=0.1):
  """Compute a greedy policy and its value estimates by value iteration.

  Each sweep recomputes, for every state, the action values
  ``Q[a] = R[s] + gamma * sum(P[s][a][s_next] * old_V[s_next])`` from the
  previous sweep's values, stores the largest one as the state's new value
  and records the action that achieved it. Sweeps stop once the largest
  absolute per-state change is below ``delta``.

  Args:
    S: sequence of states (iterated several times per sweep).
    A: sequence of actions. A single action is enough.
    R: mapping from state to reward.
    P: nested mapping, ``P[s][a][s_next]`` is the transition probability.
    gamma: discount factor applied to the expected next-state value.
    delta: convergence threshold on the largest per-state value change.

  Returns:
    A ``(policy, V)`` tuple: ``policy`` maps each state to its greedy action
    (ties go to the action listed first in ``A``) and ``V`` maps each state
    to its value estimate. Both are empty when ``S`` is empty.

  Raises:
    ValueError: if ``A`` is empty while ``S`` is not.
  """
  V = {s: 0 for s in S}
  # Filled during the first sweep; no placeholder action is needed, so an MDP
  # with a single action works too.
  policy = {}

  while True:
    old_V = V.copy()

    for s in S:
      Q = {
        a: R[s] + gamma*sum(P[s][a][s_next]*old_V[s_next] for s_next in S)
        for a in A
      }
      best_action = max(Q, key=Q.get)
      policy[s] = best_action
      V[s] = Q[best_action]

    # default=0 lets an empty state set terminate immediately.
    if max((abs(V[s] - old_V[s]) for s in S), default=0) < delta:
      break

  return policy, V


In [None]:
# Solve the same MDP with value iteration, then report the policy and its values.
optimal_policy, optimal_value_function = value_iteration(S, A, R, P)
for label, result in (('Optimal Policy', optimal_policy),
                      ('Optimal Value Function', optimal_value_function)):
  print(label, result)

Optimal Policy {'canteen': 'attend classes', 'hostel': 'attend classes', 'academic building': 'attend classes'}
Optimal Value Function {'canteen': 17.96045319629523, 'hostel': 15.189986372635445, 'academic building': 20.980262034084397}
