In [1]:
import gym
import math
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import KBinsDiscretizer
import time

In [2]:
def plot_learning(history, policy_name):
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x = [i for i in range(len(history))],
            y = history,
        )
    )

    fig.update_yaxes(title_text="Final rewards")
    fig.update_xaxes(title_text="i_games")
    fig.update_layout(title_text=policy_name.upper())
    
    return fig

In [3]:
env = gym.make('CartPole-v1')

Goal of Reinforcement learning is to find an optimal policy that will lead to most reward.

In this tutorial we will go from random sampling to Q-learning which is a model-free learning. By going step by step we will clearly understand limitations of each methods.

#### Policy 1: Random Sampling

In our example of CartPole-v1, there are only two actions at any given state therefore we will pick "left" or "right" at random.

In [11]:
def random_policy():
    action = np.random.choice(env.action_space.n)
    return action

In [12]:
num_games = 100
history = []

state = env.reset()
env.render()

for game_i in range(num_games):
    state = env.reset()
    done = False
    final_reward = 0
    while not done:
        action = random_policy()
        next_s, reward, done, info = env.step(action)
        final_reward += reward
        env.render()
        
    history.append(final_reward)
env.close()

In [13]:
policy_name = 'random policy'
fig = plot_learning(history, policy_name)
fig.write_html(f"CartPole-v1_{policy_name}.html")

Plotting how the agent performs on 100 games we can see that its policy is outputting random rewards. Next we will try to develop our policy to obtain greater reward.

#### Policy 2: Follow tilt

This policy makes cart move in the direction of the pole 

In [14]:
def follow_tilt(state):
    """Given current state information move cart in direction of the pole tilt"""
    pole_angle = state[2]
    if pole_angle > 0:
        action = 1
    else:
        action = 0
    return action

In [15]:
num_games = 100
history = []
state = env.reset()
env.render()

for game_i in range(num_games):
    state = env.reset()
    done = False
    final_reward = 0
    while not done:
        action = follow_tilt(state)
        next_s, reward, done, info = env.step(action)
        final_reward += reward
        state = next_s
        env.render()
        
    history.append(final_reward)
env.close()

In [16]:
policy_name = 'follow_tilt policy'
fig = plot_learning(history, policy_name)
fig.write_html(f"CartPole-v1_{policy_name}.html")

Even though it does better than our random policy we've created a policy using our knowledge of the environment. In most cases knowledge of the environment is not given and also this policy is only suitable for this particular type of environment, not generalized well enough.

To make our policy generalizable we will use __Q-learning__ which is a __model free__(No knowledge of environment is needed) policy to maximize our pole and see if it indeed generalizes to other problems. 

#### Policy 3: Q-learning

In [96]:
def q_choose_action(state, epsilon):
    """Given state, use q_learning policy to take action"""
    if np.random.uniform() < epsilon:
        action = np.random.choice(2)
    else:
        q_vals = q_table[state]
        action = np.argmax(q_vals)
#         perm_actions = np.random.permutation(2)
#         q_vals = [q_vals[a] for a in perm_actions] # randomizing q_vals -> why?
#         perm_q_argmax = np.argmax(q_vals)
#         action = perm_actions[perm_q_argmax]
    return action

def update_q(transition):
    """Given information and after taking action following q_learning policy we update our q_table"""
    s, a, r, next_s, done = transition
    q_val = q_table[s][a] #여기서 꺼내온걸 q_target을 사용하여 업데이트(learn) 해준다
    if done:
        q_target = r
    else:
        q_target = r + discount_value*np.max(q_table[next_s])

    q_table[s][a] += learning_rate*(q_target - q_val)
    return q_table

q_table => for each possible state we should have probabilities of all possible actions, i.e, for each unique angle and pole_velocity pair(state) it should have its own associated action probabilities which continuously get updated using q_learning

In [91]:
n_bins = ( 6 , 12 )
lower_bounds = [ env.observation_space.low[2], -math.radians(50) ]
upper_bounds = [ env.observation_space.high[2], math.radians(50) ]

est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
est.fit([lower_bounds, upper_bounds ])

def discretizer(cp, cv, angle, pole_velocity):
    """Convert continues state intro a discrete state"""
    return tuple(map(int, est.transform([[angle, pole_velocity]])[0]))

In [97]:
from collections import defaultdict

n_action_space = 2
learning_rate = 0.01 #alpha
discount_value = 0.9 #gamma
epsilon = 0.9
epsilon_min = 0.1
epsilon_decay = 0.95
q_table = np.zeros(n_bins + (n_action_space, ))

num_games = 100
history = []
q_tables = []
for game_i in range(num_games):
    if game_i % 10 == 0:
        q_tables.append(q_table)
    state = discretizer(*env.reset()) # passing observations of resetted state
    done = False
    final_reward = 0
    env.render()
    while not done:
        action = q_choose_action(state, epsilon)
        next_s, reward, done, info = env.step(action)
        next_s = discretizer(*next_s)
        q_table = update_q((state, action, reward, next_s, done), q_table)# q_table doesn't seem to get updated....
        
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay
        
        final_reward += reward
        state = next_s
        env.render()
        
    history.append(final_reward)
env.close()

In [103]:
q_tables[0] == q_tables[-1]

array([[[ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ True,  True],
        [ 

In [62]:
policy_name = 'Q-learning'
fig = plot_learning(history, policy_name)
fig.write_html(f"CartPole-v1_{policy_name}.html")

References:
- https://www.youtube.com/watch?v=JNKvJEzuNsc&t=64s