In [6]:
import numpy as np
import pandas as pd
import time

np.random.seed(6)
N_STATES = 6
ACTIONS = ['left','right']
EPSILON = 0.95
ALPHA = 0.1
GAMMA = 0.9
MAX_EPISODES = 13
FRESH_TIME = 0.3

def build_q_table(n_states,actions):
    table = pd.DataFrame(np.zeros((n_states,len(actions))),columns=actions)
    return table

def update_env(S,episode,step_counter):
    env_list = ['-'] * (N_STATES -1) + ['T']
    if S == 'terminal':
        # '---------T' our environment
        interaction = 'Episode {ep}: total_steps = {counter}'.format(ep=episode + 1, counter=step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)

def choose_action(state,q_table):
    state_actions = q_table.iloc[state,:]
    if (np.random.uniform()>EPSILON) or (state_actions.all() == 0):
        action_name = np.random.choice(ACTIONS)
    else:
        action_name = state_actions.idxmax()
    return action_name

def get_env_feedback(S,A):
    if A == 'right':
        if S == N_STATES -2:
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:
        R = 0
        if S == 0:
            S_ = S
        else:
            S_ = S -1
    return S_,R
        
def rl():
    q_table = build_q_table(N_STATES,ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S =0
        is_terminated = False
        update_env(S,episode,step_counter)
        while not is_terminated:
            A = choose_action(S,q_table)
            S_, R = get_env_feedback(S,A)
            q_predict = q_table.ix[S,A]
            if S_ != 'terminal':
                q_target = R + GAMMA * q_table.iloc[S_,:].max()
            else:
                q_target = R
                is_terminated = True
            q_table.ix[S,A] += ALPHA * (q_target - q_predict)
            S = S_
            step_counter += 1
            update_env(S,episode,step_counter)
    return q_table

In [7]:
q_table = rl()
print('\r\nQ-table:\n')
print(q_table)

o----T

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


-o---T

--o--T

-o---T

--o--T

---o-T

----oT

---o-T

--o--T

---o-T

----oT

Episode 1: total_steps = 11

                                o----T

o----T

-o---T

--o--T

-o---T

--o--T

-o---T

--o--T

-o---T

o----T

o----T

o----T

-o---T

--o--T

---o-T

--o--T

---o-T

--o--T

-o---T

--o--T

---o-T

--o--T

-o---T

o----T

-o---T

o----T

-o---T

o----T

o----T

-o---T

--o--T

-o---T

o----T

o----T

o----T

-o---T

--o--T

---o-T

--o--T

---o-T

----oT

Episode 2: total_steps = 41

                                o----T

o----T

-o---T

--o--T

-o---T

o----T

-o---T

o----T

o----T

o----T

-o---T

--o--T

---o-T

--o--T

---o-T

----oT

---o-T

----oT

Episode 3: total_steps = 18

                                o----T

-o---T

--o--T

---o-T

----oT

Episode 4: total_steps = 5

                                o----T

o----T

o----T

-o---T

o----T

-o---T

--o--T

-o---T

--o--T

---o-T

----oT

Episode 5: total_steps = 11

                                o----T

o----T

-o---T

--o--T

---o-T

----oT

Episode 6: total_steps = 6

                                o----T

-o---T

--o--T

---o-T

----oT

Episode 7: total_steps = 5

                                o----T

-o---T

--o--T

---o-T

----oT

Episode 8: total_steps = 5

                                o----T

-o---T

--o--T

---o-T

----oT

Episode 9: total_steps = 5

                                o----T

-o---T

o----T

-o---T

--o--T

---o-T

----oT

Episode 10: total_steps = 7

                                o----T

-o---T

--o--T

---o-T

----oT

Episode 11: total_steps = 5

                                o----T

-o---T

--o--T

---o-T

----oT

Episode 12: total_steps = 5

                                o----T

-o---T

--o--T

---o-T

----oT

Episode 13: total_steps = 5

                                
Q-table:

       left     right
0  0.000002  0.005499
1  0.000137  0.027613
2  0.000051  0.113783
3  0.000073  0.345873
4  0.002268  0.745813
5  0.000000  0.000000
