![](https://cdn-images-1.medium.com/max/1600/1*b8WOAfAfdzlsPq6MkkJmOw.jpeg)

In [68]:
import numpy as np

def Q_learning(env, num_episodes = 1000, eps = 1, alpha = 0.1, gamma = 0.9):
    """
    Learn a tabular action-value function with Q-learning.

    Each step picks an action epsilon-greedily from the current table and
    applies the update
    ``Q[s, a] += alpha * (r + gamma * max(Q[s', :]) - Q[s, a])``.
    After every episode epsilon is multiplied by ``(1 - 1 / num_episodes)``,
    so exploration decays geometrically and never reaches exactly zero.

    Args:
        env: Environment with discrete ``observation_space`` and
            ``action_space`` (both exposing ``n``), a ``reset()`` that
            returns the initial state index, and a ``step(a)`` that returns
            ``(next_state, reward, done, info)``.
        num_episodes (int): Number of training episodes.
        eps (float): Initial epsilon for the epsilon-greedy policy, in [0, 1].
        alpha (float): Learning rate.
        gamma (float): Discount factor.

    Returns:
        np.ndarray: Learned Q-table of shape ``(n_states, n_actions)``;
        all zeros when ``num_episodes`` is 0.
    """
    # Size the table from the public space API rather than the
    # environment-specific ``nS`` / ``nA`` attributes.
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros([n_states, n_actions])

    for _episode in range(num_episodes):
        s = env.reset()

        done = False
        while not done:
            # Explore with probability eps, otherwise act greedily.
            if np.random.rand() < eps:
                a = env.action_space.sample()
            else:
                a = np.argmax(Q[s, :])

            s_prime, r, done, _info = env.step(a)

            # Move Q[s, a] toward the bootstrapped one-step target.
            td_target = r + gamma * np.max(Q[s_prime, :])
            Q[s, a] += alpha * (td_target - Q[s, a])

            s = s_prime

        # Geometric decay of the exploration rate.
        eps -= eps / num_episodes

    return Q

In [69]:
import gym

# FrozenLake with is_slippery disabled: train a Q-table with the defaults.
env = gym.make('FrozenLake-v0', is_slippery = False)
Q = Q_learning(env)

# Sum the rewards collected by the greedy policy over 100 episodes.
ret = 0
for _ in range(100):
    s = env.reset()
    while True:
        a = np.argmax(Q[s, :])
        s, r, done, _ = env.step(a)
        ret += r
        if done:
            break

# Average reward per evaluation episode (displayed as the cell output).
ret/100

1.0

In [80]:
import gym

# Default FrozenLake: train for 10000 episodes with a larger learning rate.
env = gym.make('FrozenLake-v0')
Q = Q_learning(env, num_episodes = 10000, eps = 1, alpha = 0.8, gamma = 0.95)

# Roll out the greedy policy for 100 episodes and total the rewards.
ret = 0
for _ in range(100):
    s = env.reset()
    while True:
        a = np.argmax(Q[s, :])
        s, r, done, _ = env.step(a)
        ret += r
        if done:
            break

# Average reward per evaluation episode (displayed as the cell output).
ret/100

0.16

In [79]:
env = gym.make('FrozenLake-v0')

# Q-table starts at zero: one row per state, one column per action.
Q = np.zeros([env.observation_space.n,env.action_space.n])

# Learning parameters: step size, discount factor, number of episodes.
lr = .8
y = .95
num_episodes = 2000

# Total reward collected in each episode.
rList = []
for i in range(num_episodes):
    s = env.reset()
    rAll = 0
    # Exploration noise is scaled down as the episode index grows.
    noise_scale = 1./(i+1)
    # j counts the steps taken; an episode is capped at 99 steps.
    for j in range(1, 100):
        # Greedy choice over the noise-perturbed Q-values of the current state.
        noise = np.random.randn(1,env.action_space.n)
        a = np.argmax(Q[s,:] + noise*noise_scale)
        s1,r,d,_ = env.step(a)
        # Move Q[s,a] toward the one-step bootstrapped target.
        target = r + y*np.max(Q[s1,:])
        Q[s,a] = Q[s,a] + lr*(target - Q[s,a])
        rAll += r
        s = s1
        if d:
            break
    rList.append(rAll)

print("Score over time: " +  str(sum(rList)/num_episodes))

Score over time: 0.423


In [58]:
# Index of the highest-valued action in each row (state) of the Q-table.
Q.argmax(axis=1)

array([2, 3, 0, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 3, 0], dtype=int64)