<a href="https://colab.research.google.com/github/Isaivargas/machineLearningAgents/blob/master/frozenLakeQlearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**The goal of this game is to go from the starting state (S) to the goal state (G) by walking only on frozen tiles (F) and avoiding holes (H). However, the ice is slippery, so you won't always move in the direction you intend (stochastic environment).**

---



In [0]:
import numpy as np
import gym
import random

The environment is Frozen Lake from **OpenAI Gym**.


In [0]:
# Build the environment registered in Gym under the id "FrozenLake-v0".
# Every later cell interacts with the game through this single `env` object.
env = gym.make("FrozenLake-v0")

Step #1: create and initialize the Q-table.



In [0]:
# Number of states and number of actions, read from the environment's
# observation and action spaces.
state_size, action_size = env.observation_space.n, env.action_space.n

In [19]:
# Q-table: one row per state, one column per action, every entry starting at zero.
qtable = np.zeros(shape=(state_size, action_size), dtype=np.float64)
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


Create the hyperparameters.

In [0]:
# --- Training schedule ---
total_episodes = 15000    # number of training episodes
max_steps = 99            # upper bound on steps within a single episode

# --- Q-value update ---
learning_rate = 0.1       # step size applied to each Q-value correction
gamma = 0.95              # discount factor for future reward

# --- Epsilon-greedy exploration ---
max_epsilon = 1.0         # exploration probability at the start of training
min_epsilon = 0.01        # lowest exploration probability the decay can reach
decay_rate = 0.001        # exponential decay rate of the exploration probability
epsilon = max_epsilon     # current exploration rate; begins at its maximum

The Q-learning algorithm 🧠

Implementation of the Q-learning algorithm.


In [21]:
# Train the Q-table with epsilon-greedy Q-learning.
#
# Reads: env, qtable, total_episodes, max_steps, learning_rate, gamma,
#        max_epsilon, min_epsilon, decay_rate.
# Writes: qtable (updated in place), epsilon, rewards.
# Note: qtable is not re-created here, so re-running this cell continues
# learning from whatever values the table currently holds.

# Total reward collected in each episode.
rewards = []

# Always begin at the initial exploration rate. Without this, re-running the
# cell would start its first episode with the decayed epsilon left behind by
# the previous run.
epsilon = max_epsilon

# Step 2: for life, or until learning is stopped.
# Loop over episodes (trajectories).
for episode in range(total_episodes):
    # Reset the environment and the per-episode reward counter.
    state = env.reset()
    total_rewards = 0

    # Loop over the steps of one episode.
    for step in range(max_steps):
        # Step 3: choose an action a in the current world state s.
        # Draw a uniform number in [0, 1]: above epsilon -> exploit (take the
        # action with the largest Q-value for this state), otherwise explore
        # (take a random action).
        if random.uniform(0, 1) > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        # Take the action and observe the outcome state s' and reward r.
        new_state, reward, done, info = env.step(action)

        # Q(s,a) := Q(s,a) + lr * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        # qtable[new_state, :] holds the values of every action available in s'.
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        total_rewards += reward

        # Move on to the state we just reached.
        state = new_state

        # The environment reported the episode as finished: stop stepping.
        if done:
            break

    # Reduce epsilon after every episode (less and less exploration is needed):
    # exponential decay from max_epsilon towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

# Average reward per episode over the whole training run.
print("Score over time: " + str(sum(rewards) / total_episodes))
print(qtable)

Score over time: 0.5304
[[0.21737892 0.16166348 0.17840212 0.17027186]
 [0.09387525 0.10138598 0.09394584 0.17135135]
 [0.15023787 0.10673863 0.10321249 0.10865932]
 [0.05904939 0.05553839 0.06568965 0.10872124]
 [0.24891298 0.17254694 0.12632492 0.13247031]
 [0.         0.         0.         0.        ]
 [0.07043108 0.04159124 0.11060563 0.05529311]
 [0.         0.         0.         0.        ]
 [0.13279136 0.17710833 0.19482411 0.31036498]
 [0.14869659 0.41772571 0.30244281 0.20835614]
 [0.38339312 0.23482281 0.2365729  0.17984912]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.20713168 0.336669   0.5565108  0.3767499 ]
 [0.57298619 0.74720964 0.55940739 0.57545256]
 [0.         0.         0.         0.        ]]


In [22]:
# Watch the trained agent: play 5 episodes, always taking the action with the
# highest Q-value for the current state (no exploration, no learning).
# Reads env, qtable and max_steps; closes the environment when finished.
for episode in range(5):
    # env.reset() returns the starting state, so no separate reset is needed
    # before the loop.
    state = env.reset()
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Take the action (index) with the maximum expected future reward
        # for this state.
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        if done:
            # Only render the final state, to see whether the agent reached
            # the goal or fell into a hole.
            env.render()

            # `step` is a zero-based loop index, so the number of actions
            # actually taken is step + 1.
            print("Number of steps", step + 1)
            break
        state = new_state
    else:
        # The loop ran out of steps without the environment reporting `done`.
        print("Episode not finished after", max_steps, "steps")
env.close()

****************************************************
EPISODE  0
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 8
****************************************************
EPISODE  1
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 42
****************************************************
EPISODE  2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 80
****************************************************
EPISODE  3
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 62
****************************************************
EPISODE  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 61
