# Test

In [1]:
import gym 
import numpy as np
env = gym.make('FrozenLake-v0')

In [2]:
def policy_iteration(env, n_iter=20000, gamma=1.0):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Starts from the all-zeros policy and, for at most ``n_iter`` rounds,
    evaluates the current policy with ``compute_value_function`` and derives
    a new policy from those values with ``extract_policy``. Stops early (and
    prints a message) as soon as the new policy equals the current one.

    Args:
        env: Wrapped environment. Only ``env.env.nS`` is read here; the
            object is otherwise forwarded to the two helper functions.
        n_iter: Maximum number of evaluation/improvement rounds.
        gamma: Discount factor forwarded to both helpers.

    Returns:
        A NumPy array of length ``env.env.nS`` with one action index per
        state. If ``n_iter`` is 0 the initial all-zeros policy is returned.
    """
    policy = np.zeros(env.env.nS)
    for i in range(n_iter):
        new_value_function = compute_value_function(env, policy, gamma)
        new_policy = extract_policy(env, new_value_function, gamma)
        if (np.all(new_policy==policy)):
            print('Iteracija politika je konvergirala u {} koraku'.format(i+1))
            break
        policy = new_policy
    # Return `policy`, not `new_policy`: on convergence the two are equal, after
    # a full run `policy` was just set to `new_policy`, and with n_iter == 0
    # `new_policy` was never bound (the original raised UnboundLocalError).
    return policy

In [3]:
def compute_value_function(env, policy, gamma, threshold=1e-10):
    """Evaluate a fixed policy by repeated sweeps over all states.

    Each sweep recomputes every state's value from a snapshot of the previous
    sweep, following the action ``policy[state]``. Sweeping stops once the
    summed absolute change across all states is no longer above ``threshold``.

    Args:
        env: Wrapped environment exposing ``env.env.nS`` and ``env.env.P``,
            where ``P[state][action]`` iterates over 4-tuples of
            (transition probability, next state, reward, unused).
        policy: Indexable giving the action to take in each state.
        gamma: Discount factor applied to the next state's value.
        threshold: Total per-sweep change below which evaluation stops.

    Returns:
        NumPy array of length ``env.env.nS`` with the evaluated state values.
    """
    n_states = env.env.nS
    transitions = env.env.P
    value = np.zeros(n_states)
    while True:
        # Freeze the values from the previous sweep so every state in this
        # sweep is backed up from the same snapshot.
        previous = value.copy()
        for state in range(n_states):
            backup = 0
            for prob, successor, reward, _ in transitions[state][policy[state]]:
                backup += prob * (reward + gamma * previous[successor])
            value[state] = backup
        total_change = np.abs(previous - value).sum()
        if not (total_change > threshold):
            break
    return value

In [4]:
def extract_policy(env, value, gamma):
    """Derive the greedy policy for a given state-value estimate.

    For every state, computes the expected one-step return of each action
    (reward plus discounted value of the successor) and picks the action
    with the largest return; ties go to the lowest action index.

    Args:
        env: Wrapped environment exposing ``env.env.nS``, ``env.env.nA`` and
            ``env.env.P``, where ``P[state][action]`` iterates over 4-tuples
            of (transition probability, next state, reward, unused).
        value: Indexable of state values, one entry per state.
        gamma: Discount factor applied to the successor's value.

    Returns:
        NumPy array of length ``env.env.nS`` holding the chosen action index
        for each state.
    """
    model = env.env
    greedy = np.zeros(model.nS)
    for state in range(model.nS):
        action_returns = [
            sum(
                prob * (reward + gamma * value[successor])
                for prob, successor, reward, _ in model.P[state][action]
            )
            for action in range(model.nA)
        ]
        greedy[state] = np.argmax(action_returns)
    return greedy

In [5]:
# Run policy iteration with its default n_iter and gamma and print the
# resulting per-state action array.
print(policy_iteration(env))

Iteracija politika je konvergirala u 7 koraku
[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]


In [6]:
import gym
import numpy as np
env = gym.make('FrozenLake-v0')

In [7]:
def value_iteration(env, gamma=1.0, n_iter=100000, threshold=1e-20):
    """Estimate optimal state values by repeated Bellman optimality backups.

    Each sweep sets every state's value to the best expected one-step return
    over all actions, computed from a snapshot of the previous sweep. Stops
    early (and prints a message) once the summed absolute change of a sweep
    is at most ``threshold``; otherwise runs for ``n_iter`` sweeps.

    Args:
        env: Wrapped environment exposing ``env.env.nS``, ``env.env.nA`` and
            ``env.env.P``, where ``P[state][action]`` iterates over 4-tuples
            of (transition probability, next state, reward, unused).
        gamma: Discount factor applied to the successor's value.
        n_iter: Maximum number of sweeps.
        threshold: Total per-sweep change at or below which iteration stops.

    Returns:
        NumPy array of length ``env.env.nS`` with the estimated state values.
    """
    model = env.env
    value = np.zeros(model.nS)
    for sweep in range(n_iter):
        # Back up from a frozen copy so all states in a sweep see the same values.
        snapshot = value.copy()
        for state in range(model.nS):
            value[state] = max([
                np.sum([
                    prob * (reward + gamma * snapshot[successor])
                    for prob, successor, reward, _ in model.P[state][action]
                ])
                for action in range(model.nA)
            ])
        if np.abs(value - snapshot).sum() <= threshold:
            print('Algoritam je konvergirao nakon {} iteracija'.format(sweep + 1))
            break
    return value

In [8]:
# Run value iteration with its default gamma, n_iter and threshold and keep
# the returned state-value array for later use.
optimal_value_function = value_iteration(env)

Algoritam je konvergirao nakon 1373 iteracija


# Bellman Equation
$V(s) = \max\limits_a(R(s,a) + \gamma\sum\limits_{s'}T(s,a,s')V(s'))$

## Value iteration

In [9]:
import gym 
import numpy as np
env = gym.make('FrozenLake-v0')

In [10]:
def value_iteration(env, gamma=1.0, n_iter=100000, threshold=1e-20):
    """Compute state values with the Bellman optimality update.

    Repeatedly replaces each state's value with the maximum, over actions,
    of the expected reward plus discounted successor value, reading successor
    values from a copy taken at the start of the sweep. Prints a message and
    stops as soon as one sweep changes the values by at most ``threshold`` in
    total; otherwise performs ``n_iter`` sweeps.

    Args:
        env: Wrapped environment exposing ``env.env.nS``, ``env.env.nA`` and
            ``env.env.P``, where ``P[state][action]`` iterates over 4-tuples
            of (transition probability, next state, reward, unused).
        gamma: Discount factor applied to the successor's value.
        n_iter: Maximum number of sweeps.
        threshold: Total per-sweep change at or below which iteration stops.

    Returns:
        NumPy array of length ``env.env.nS`` with the estimated state values.
    """
    n_states = env.env.nS
    value = np.zeros(n_states)
    for sweep in range(n_iter):
        previous = value.copy()
        for state in range(n_states):
            best = [
                np.sum([
                    p * (r + gamma * previous[nxt])
                    for p, nxt, r, _ in env.env.P[state][action]
                ])
                for action in range(env.env.nA)
            ]
            value[state] = max(best)
        converged = np.abs(value - previous).sum() <= threshold
        if converged:
            print('Algoritam je konvergirao nakon {} iteracija'.format(sweep + 1))
            break
    return value

## Policy iteration

In [11]:
def policy_iteration(env, n_iter=20000, gamma=1.0, threshold=1e-10):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Starts from the all-zeros policy and, for at most ``n_iter`` rounds,
    evaluates the current policy with ``compute_value_function`` and derives
    a new policy from those values with ``extract_policy``. Stops early (and
    prints a message) as soon as the new policy equals the current one.

    Args:
        env: Wrapped environment. Only ``env.env.nS`` is read here; the
            object is otherwise forwarded to the two helper functions.
        n_iter: Maximum number of evaluation/improvement rounds.
        gamma: Discount factor forwarded to both helpers.
        threshold: Convergence threshold forwarded to
            ``compute_value_function``.

    Returns:
        A NumPy array of length ``env.env.nS`` with one action index per
        state. If ``n_iter`` is 0 the initial all-zeros policy is returned.
    """
    policy = np.zeros(env.env.nS)
    for i in range(n_iter):
        new_value_function = compute_value_function(env, policy, gamma, threshold)
        # The original cell stopped after the evaluation step, so the policy
        # was never improved and the function returned None. The remaining
        # steps mirror the earlier three-argument definition in this notebook.
        new_policy = extract_policy(env, new_value_function, gamma)
        if np.all(new_policy == policy):
            print('Iteracija politika je konvergirala u {} koraku'.format(i+1))
            break
        policy = new_policy
    return policy

# Taxi-v2

In [12]:
import random
import gym

In [13]:
# Environment whose state/action counts size the Q-table below.
env = gym.make('Taxi-v2')

# alpha: step size used in update_q_table; gamma: discount on the next state's
# best Q-value; epsilon: probability of a random action in epsilon_greedy_policy.
alpha, gamma, epsilon = 0.4, 0.999, 0.017

# One zero-initialised entry per (state, action) pair.
q_table = {
    (s, a): 0
    for s in range(env.env.nS)
    for a in range(env.env.nA)
}

In [14]:
def update_q_table(prev_state, action, reward, next_state, alpha, gamma):
    """Apply one Q-learning update to the module-level ``q_table`` in place.

    Moves the entry for ``(prev_state, action)`` towards
    ``reward + gamma * max_a q_table[(next_state, a)]`` by a fraction
    ``alpha`` of the difference.

    Args:
        prev_state: State in which the action was taken.
        action: Action that was taken.
        reward: Reward received for that action.
        next_state: State reached after the action.
        alpha: Step size of the update.
        gamma: Discount applied to the best next-state Q-value.
    """
    key = (prev_state, action)
    best_next = max(q_table[(next_state, a)] for a in range(env.env.nA))
    td_target = reward + gamma * best_next
    q_table[key] += alpha * (td_target - q_table[key])

In [15]:
def epsilon_greedy_policy(state, epsilon):
    """Choose an action for ``state`` using an epsilon-greedy rule.

    Draws a uniform random number in [0, 1]; if it is below ``epsilon`` a
    random action is sampled from ``env.action_space``, otherwise the action
    with the highest entry in the module-level ``q_table`` is returned
    (the lowest action index wins ties).

    Args:
        state: Current state, used as the first component of the Q-table key.
        epsilon: Probability threshold for taking a random action.

    Returns:
        The selected action.
    """
    explore = random.uniform(0,1) < epsilon
    if explore:
        return env.action_space.sample()

    def q_of(candidate):
        return q_table[(state, candidate)]

    return max(range(env.env.nA), key=q_of)

In [17]:
# Q-learning training loop: 8000 episodes of acting epsilon-greedily and
# updating the module-level q_table after every step.
# NOTE(review): this prints one line per episode (8000 lines of output);
# consider aggregating the rewards instead.
# NOTE(review): no random seed is set in this cell, so runs are presumably not
# reproducible - confirm whether one is set elsewhere.
env.render()
for i in range(8000):
    # r accumulates the total reward of the current episode.
    r = 0
    prev_state = env.reset()
    
    while True:
                
        action = epsilon_greedy_policy(prev_state, epsilon)
        
        # The fourth item returned by env.step is discarded.
        next_state, reward, done, _ = env.step(action)
        
        update_q_table(prev_state, action, reward, next_state, alpha, gamma)
        
        prev_state = next_state
        
        r += reward
        
        # The episode ends when the environment reports done.
        if done:
            break
    
    print('Total reward: ', r)
# Render the environment as it stands after the last episode, then close it.
env.render()        
env.close()

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (Dropoff)
Total reward:  8
Total reward:  9
Total reward:  8
Total reward:  11
Total reward:  7
Total reward:  9
Total reward:  5
Total reward:  6
Total reward:  -2
Total reward:  6
Total reward:  8
Total reward:  9
Total reward:  10
Total reward:  11
Total reward:  5
Total reward:  9
Total reward:  12
Total reward:  9
Total reward:  9
Total reward:  9
Total reward:  5
Total reward:  6
Total reward:  8
Total reward:  8
Total reward:  -3
Total reward:  11
Total reward:  7
Total reward:  6
Total reward:  10
Total reward:  6
Total reward:  7
Total reward:  6
Total reward:  11
Total reward:  7
Total reward:  9
Total reward:  8
Total reward:  11
Total reward:  7
Total reward:  8
Total reward:  11
Total reward:  10
Total reward:  9
Total reward:  7
Total reward:  12
Total reward:  9
Total reward:  6
Total reward:  8
Total reward:  12
Total reward:  8
Total reward:  -3
Total reward:  -1
Tot

Total reward:  9
Total reward:  5
Total reward:  11
Total reward:  8
Total reward:  8
Total reward:  -1
Total reward:  5
Total reward:  9
Total reward:  8
Total reward:  10
Total reward:  9
Total reward:  7
Total reward:  12
Total reward:  9
Total reward:  9
Total reward:  10
Total reward:  4
Total reward:  14
Total reward:  10
Total reward:  -5
Total reward:  9
Total reward:  10
Total reward:  10
Total reward:  7
Total reward:  3
Total reward:  4
Total reward:  5
Total reward:  7
Total reward:  6
Total reward:  10
Total reward:  12
Total reward:  7
Total reward:  7
Total reward:  -5
Total reward:  11
Total reward:  9
Total reward:  4
Total reward:  8
Total reward:  8
Total reward:  6
Total reward:  11
Total reward:  9
Total reward:  9
Total reward:  8
Total reward:  11
Total reward:  5
Total reward:  12
Total reward:  8
Total reward:  10
Total reward:  -1
Total reward:  10
Total reward:  9
Total reward:  3
Total reward:  9
Total reward:  10
Total reward:  -5
Total reward:  6
Total rew

Total reward:  9
Total reward:  9
Total reward:  12
Total reward:  5
Total reward:  12
Total reward:  7
Total reward:  9
Total reward:  3
Total reward:  9
Total reward:  9
Total reward:  8
Total reward:  12
Total reward:  7
Total reward:  7
Total reward:  7
Total reward:  12
Total reward:  10
Total reward:  14
Total reward:  9
Total reward:  7
Total reward:  10
Total reward:  5
Total reward:  9
Total reward:  7
Total reward:  6
Total reward:  10
Total reward:  12
Total reward:  9
Total reward:  9
Total reward:  1
Total reward:  10
Total reward:  8
Total reward:  6
Total reward:  10
Total reward:  8
Total reward:  14
Total reward:  10
Total reward:  6
Total reward:  4
Total reward:  7
Total reward:  4
Total reward:  6
Total reward:  13
Total reward:  10
Total reward:  7
Total reward:  11
Total reward:  8
Total reward:  10
Total reward:  9
Total reward:  7
Total reward:  7
Total reward:  8
Total reward:  8
Total reward:  6
Total reward:  11
Total reward:  -6
Total reward:  7
Total reward

Total reward:  8
Total reward:  13
Total reward:  -2
Total reward:  4
Total reward:  10
Total reward:  5
Total reward:  5
Total reward:  5
Total reward:  11
Total reward:  9
Total reward:  10
Total reward:  4
Total reward:  13
Total reward:  10
Total reward:  4
Total reward:  9
Total reward:  6
Total reward:  12
Total reward:  7
Total reward:  -1
Total reward:  8
Total reward:  7
Total reward:  7
Total reward:  8
Total reward:  7
Total reward:  10
Total reward:  9
Total reward:  7
Total reward:  7
Total reward:  5
Total reward:  7
Total reward:  5
Total reward:  8
Total reward:  8
Total reward:  12
Total reward:  6
Total reward:  7
Total reward:  8
Total reward:  11
Total reward:  7
Total reward:  9
Total reward:  9
Total reward:  9
Total reward:  10
Total reward:  11
Total reward:  9
Total reward:  8
Total reward:  9
Total reward:  9
Total reward:  1
Total reward:  8
Total reward:  9
Total reward:  8
Total reward:  9
Total reward:  6
Total reward:  7
Total reward:  -1
Total reward:  6

Total reward:  6
Total reward:  9
Total reward:  6
Total reward:  5
Total reward:  -4
Total reward:  7
Total reward:  8
Total reward:  1
Total reward:  9
Total reward:  8
Total reward:  9
Total reward:  11
Total reward:  12
Total reward:  3
Total reward:  9
Total reward:  4
Total reward:  6
Total reward:  5
Total reward:  -4
Total reward:  9
Total reward:  5
Total reward:  7
Total reward:  12
Total reward:  12
Total reward:  12
Total reward:  7
Total reward:  9
Total reward:  7
Total reward:  -5
Total reward:  9
Total reward:  11
Total reward:  7
Total reward:  10
Total reward:  9
Total reward:  6
Total reward:  12
Total reward:  8
Total reward:  7
Total reward:  11
Total reward:  8
Total reward:  6
Total reward:  5
Total reward:  9
Total reward:  12
Total reward:  6
Total reward:  5
Total reward:  9
Total reward:  12
Total reward:  8
Total reward:  7
Total reward:  7
Total reward:  4
Total reward:  12
Total reward:  3
Total reward:  12
Total reward:  8
Total reward:  7
Total reward:  

Total reward:  6
Total reward:  10
Total reward:  10
Total reward:  8
Total reward:  11
Total reward:  12
Total reward:  6
Total reward:  4
Total reward:  8
Total reward:  8
Total reward:  6
Total reward:  6
Total reward:  7
Total reward:  6
Total reward:  10
Total reward:  9
Total reward:  13
Total reward:  11
Total reward:  -3
Total reward:  3
Total reward:  6
Total reward:  7
Total reward:  14
Total reward:  9
Total reward:  6
Total reward:  8
Total reward:  8
Total reward:  11
Total reward:  7
Total reward:  9
Total reward:  6
Total reward:  8
Total reward:  7
Total reward:  4
Total reward:  8
Total reward:  5
Total reward:  5
Total reward:  6
Total reward:  10
Total reward:  10
Total reward:  7
Total reward:  7
Total reward:  7
Total reward:  8
Total reward:  15
Total reward:  -4
Total reward:  7
Total reward:  14
Total reward:  6
Total reward:  10
Total reward:  12
Total reward:  3
Total reward:  7
Total reward:  9
Total reward:  -1
Total reward:  13
Total reward:  12
Total rewar

Total reward:  8
Total reward:  6
Total reward:  7
Total reward:  6
Total reward:  6
Total reward:  9
Total reward:  9
Total reward:  8
Total reward:  7
Total reward:  11
Total reward:  11
Total reward:  8
Total reward:  7
Total reward:  13
Total reward:  4
Total reward:  13
Total reward:  1
Total reward:  9
Total reward:  6
Total reward:  7
Total reward:  7
Total reward:  9
Total reward:  11
Total reward:  3
Total reward:  9
Total reward:  7
Total reward:  6
Total reward:  7
Total reward:  11
Total reward:  8
Total reward:  10
Total reward:  11
Total reward:  8
Total reward:  10
Total reward:  10
Total reward:  12
Total reward:  14
Total reward:  9
Total reward:  7
Total reward:  8
Total reward:  11
Total reward:  3
Total reward:  8
Total reward:  9
Total reward:  7
Total reward:  6
Total reward:  6
Total reward:  -3
Total reward:  5
Total reward:  10
Total reward:  11
Total reward:  8
Total reward:  14
Total reward:  11
Total reward:  5
Total reward:  10
Total reward:  9
Total reward

Total reward:  9
Total reward:  10
Total reward:  7
Total reward:  5
Total reward:  6
Total reward:  10
Total reward:  8
Total reward:  5
Total reward:  12
Total reward:  4
Total reward:  1
Total reward:  7
Total reward:  7
Total reward:  12
Total reward:  10
Total reward:  9
Total reward:  -4
Total reward:  6
Total reward:  7
Total reward:  1
Total reward:  1
Total reward:  8
Total reward:  8
Total reward:  8
Total reward:  14
Total reward:  8
Total reward:  7
Total reward:  -2
Total reward:  -6
Total reward:  10
Total reward:  8
Total reward:  12
Total reward:  9
Total reward:  4
Total reward:  8
Total reward:  9
Total reward:  13
Total reward:  13
Total reward:  9
Total reward:  4
Total reward:  9
Total reward:  6
Total reward:  9
Total reward:  4
Total reward:  9
Total reward:  4
Total reward:  13
Total reward:  8
Total reward:  11
Total reward:  10
Total reward:  5
Total reward:  7
Total reward:  6
Total reward:  9
Total reward:  -3
Total reward:  7
Total reward:  6
Total reward: 

Total reward:  4
Total reward:  -5
Total reward:  7
Total reward:  9
Total reward:  5
Total reward:  9
Total reward:  5
Total reward:  8
Total reward:  5
Total reward:  -1
Total reward:  -3
Total reward:  11
Total reward:  6
Total reward:  2
Total reward:  12
Total reward:  7
Total reward:  10
Total reward:  9
Total reward:  12
Total reward:  6
Total reward:  9
Total reward:  8
Total reward:  9
Total reward:  6
Total reward:  6
Total reward:  4
Total reward:  2
Total reward:  8
Total reward:  7
Total reward:  11
Total reward:  9
Total reward:  8
Total reward:  8
Total reward:  6
Total reward:  10
Total reward:  4
Total reward:  5
Total reward:  8
Total reward:  6
Total reward:  7
Total reward:  7
Total reward:  8
Total reward:  11
Total reward:  9
Total reward:  8
Total reward:  9
Total reward:  0
Total reward:  9
Total reward:  8
Total reward:  8
Total reward:  3
Total reward:  5
Total reward:  12
Total reward:  8
Total reward:  11
Total reward:  8
Total reward:  5
Total reward:  7
To

Total reward:  9
Total reward:  6
Total reward:  10
Total reward:  9
Total reward:  9
Total reward:  13
Total reward:  -3
Total reward:  11
Total reward:  11
Total reward:  9
Total reward:  10
Total reward:  9
Total reward:  12
Total reward:  -3
Total reward:  5
Total reward:  8
Total reward:  9
Total reward:  9
Total reward:  7
Total reward:  6
Total reward:  10
Total reward:  6
Total reward:  9
Total reward:  7
Total reward:  8
Total reward:  6
Total reward:  10
Total reward:  9
Total reward:  9
Total reward:  14
Total reward:  6
Total reward:  7
Total reward:  13
Total reward:  6
Total reward:  10
Total reward:  10
Total reward:  6
Total reward:  8
Total reward:  5
Total reward:  10
Total reward:  7
Total reward:  9
Total reward:  13
Total reward:  10
Total reward:  9
Total reward:  5
Total reward:  5
Total reward:  8
Total reward:  -5
Total reward:  3
Total reward:  7
Total reward:  10
Total reward:  8
Total reward:  6
Total reward:  8
Total reward:  8
Total reward:  10
Total rewar

Total reward:  10
Total reward:  9
Total reward:  10
Total reward:  8
Total reward:  6
Total reward:  8
Total reward:  9
Total reward:  9
Total reward:  9
Total reward:  6
Total reward:  11
Total reward:  8
Total reward:  7
Total reward:  8
Total reward:  13
Total reward:  7
Total reward:  10
Total reward:  9
Total reward:  11
Total reward:  11
Total reward:  9
Total reward:  12
Total reward:  9
Total reward:  8
Total reward:  5
Total reward:  5
Total reward:  6
Total reward:  10
Total reward:  11
Total reward:  6
Total reward:  5
Total reward:  10
Total reward:  11
Total reward:  9
Total reward:  9
Total reward:  8
Total reward:  11
Total reward:  9
Total reward:  9
Total reward:  13
Total reward:  4
Total reward:  9
Total reward:  14
Total reward:  13
Total reward:  4
Total reward:  11
Total reward:  7
Total reward:  10
Total reward:  9
Total reward:  11
Total reward:  10
Total reward:  11
Total reward:  10
Total reward:  9
Total reward:  7
Total reward:  10
Total reward:  7
Total re