<a href="https://colab.research.google.com/github/KhangTran2503/CS114.K21.KHTN/blob/master/Test_MDPs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Reinforcement Learning with OpenAI Gym**
## [OpenAI Gym](https://gym.openai.com/)

> **OpenAI Gym is a toolkit for developing and comparing reinforcement learning algorithms**. It supports teaching agents everything from walking to playing games like Pong or Pinball.


In [0]:
# Install the gym package into the notebook's environment.
# NOTE(review): the version is unpinned, while later cells rely on the
# 'FrozenLake-v0' environment id and on `env.step` returning a 4-tuple —
# confirm the gym release that gets installed still supports both.
!pip install gym


In [0]:
import numpy as np
import gym
# Create the FrozenLake environment shared by every cell below.
# NOTE(review): the 'FrozenLake-v0' id appears to be registered only by older
# gym releases — confirm against the installed gym version.
env = gym.make('FrozenLake-v0')

In [0]:
# Put the environment into its initial state; as the last expression of the
# cell, the value returned by reset() is what the notebook displays.
env.reset()

In [0]:
def value_iteration(env, max_iters,gamma = 0.9):
    """Estimate state values with synchronous value-iteration sweeps.

    Parameters
    ----------
    env : object
        Must expose ``observation_space.n`` (number of states),
        ``action_space.n`` (number of actions) and a transition table ``P``
        where ``P[state][action]`` is an iterable of
        ``(prob, next_state, reward, done)`` tuples. The ``done`` flag is
        not used by the backup.
    max_iters : int
        Upper bound on the number of sweeps over all states.
    gamma : float, optional
        Discount factor applied to the value of the successor state.

    Returns
    -------
    numpy.ndarray
        One value per state, as of the last sweep that was executed.

    Notes
    -----
    Prints a message and stops early once a sweep leaves every value
    unchanged according to ``np.isclose``.
    """
    n_states = env.observation_space.n
    v_values = np.zeros(n_states)

    for sweep in range(max_iters):
        # Freeze the previous estimates so that every backup performed in
        # this sweep reads the same (old) values.
        frozen = v_values.copy()

        for s in range(n_states):
            backups = []
            for a in range(env.action_space.n):
                # Expected one-step return of taking action `a` in state `s`.
                expected = 0
                for p, s_next, r, _ in env.P[s][a]:
                    expected += p * (r + gamma * frozen[s_next])
                backups.append(expected)

            # The state's value is the largest one-step look-ahead return.
            v_values[s] = np.max(np.asarray(backups))

        # Stop as soon as a full sweep no longer changes any value.
        if np.isclose(v_values, frozen).all():
            print('Converged at {}-th iteration.'.format(sweep))
            break

    return v_values

In [0]:
def policy_extraction(env, v_values,gamma=0.9):
    """Derive the greedy policy implied by a table of state values.

    Parameters
    ----------
    env : object
        Must expose ``observation_space.n`` (number of states),
        ``action_space.n`` (number of actions) and a transition table ``P``
        where ``P[state][action]`` is an iterable of
        ``(prob, next_state, reward, done)`` tuples. The ``done`` flag is
        not used.
    v_values : sequence of float
        Value estimate for every state, indexable by state number.
    gamma : float, optional
        Discount factor applied to the value of the successor state.

    Returns
    -------
    numpy.ndarray
        Integer array holding, for each state, the index of the action with
        the highest one-step look-ahead return (first index on ties).
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Use the builtin `int` as dtype: `np.int` was only an alias for it and
    # has been removed from NumPy (deprecated in 1.20, gone in 1.24).
    policy = np.zeros(n_states, dtype=int)

    for state in range(n_states):
        q_values = []
        for action in range(n_actions):
            # Expected one-step return of `action` under the given values.
            q_value = 0
            for prob, next_state, reward, _done in env.P[state][action]:
                q_value += prob * (reward + gamma * v_values[next_state])
            q_values.append(q_value)

        # Act greedily with respect to the look-ahead returns.
        policy[state] = np.argmax(np.asarray(q_values))

    return policy


In [0]:
# Run value iteration on the environment for at most 1000 sweeps with a
# discount factor of 0.9.
v_values = value_iteration(env,max_iters=1000,gamma=0.9)

In [0]:
# Turn the computed state values into a greedy policy, using the same
# discount factor that was used for value iteration.
policy = policy_extraction(env,v_values,gamma=0.9)

In [0]:
# Show the action index selected for each state.
print(policy)

In [0]:
# Display the environment's current state using its render() method.
env.render()

In [0]:
def play(env, policy):
    """Run one episode, always taking the action the policy prescribes.

    Parameters
    ----------
    env : object
        Environment whose ``reset()`` returns the initial state and whose
        ``step(action)`` returns ``(next_state, reward, done, info)``.
    policy : indexable
        Maps a state to the action to take in it (``policy[state]``).

    Returns
    -------
    tuple
        ``(reward, steps)``: the reward returned by the final step and the
        number of steps taken until the environment reported ``done``.
    """
    observation = env.reset()
    step_count = 0

    # At least one step is always taken; the loop ends on the first step
    # for which the environment reports that the episode is over.
    while True:
        observation, last_reward, finished, _ = env.step(policy[observation])
        step_count += 1
        if finished:
            break

    return last_reward, step_count


In [0]:
def play_multiple_times(env, policy, num_episodes=1000):
    """Play several episodes with a fixed policy and print summary statistics.

    Each episode is run with ``play``, which is defined elsewhere in this
    notebook. An episode counts as a success when its final reward equals 1;
    every other episode counts as a failure.

    Parameters
    ----------
    env : object
        Environment passed through to ``play``.
    policy : indexable
        State-to-action mapping passed through to ``play``.
    num_episodes : int, optional
        Number of episodes to play. Defaults to 1000, the value that was
        previously hard-coded.

    Returns
    -------
    None
        The failure count and the average number of steps over successful
        episodes are printed rather than returned.
    """
    list_of_steps = []
    num_failures = 0

    for _ in range(num_episodes):
        reward, steps = play(env, policy)
        if reward == 1:
            list_of_steps.append(steps)
        else:
            num_failures += 1

    # np.mean of an empty list emits a RuntimeWarning before returning nan;
    # when no episode succeeded, report nan directly without the warning.
    avg_steps = np.mean(list_of_steps) if list_of_steps else float('nan')

    print('# failures: {}/{}'.format(num_failures,num_episodes))
    print('avg. # step: {}'.format(avg_steps))

In [0]:
# Evaluate the extracted policy over repeated episodes; prints the number of
# failed episodes and the average step count of the successful ones.
play_multiple_times(env,policy)