In [1]:
import numpy as np

In [2]:
import gym
import random
import time
from IPython.display import clear_output

In [3]:
# `env` is the single environment instance used by the cells in this notebook
# (sizing the Q-table, training, and the final playback).
env = gym.make("FrozenLake-v0") #create the FrozenLake-v0 environment from gym's registry

<h3> Frozen Lake </h3>
<img src="FrozenLake.png">

<h3>Q_values initialization</h3>

In [5]:
# The Q-table holds one row per state and one column per action, so both counts
# are read from the environment before the table is allocated.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Every (state, action) estimate starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

# Show the freshly initialised table.
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


<h3>Parameters for training</h3>
<p>These are the hyperparameters we can tune to see how well, and how efficiently, the agent learns.</p>

In [7]:
# --- Training budget ---
# Number of episodes the agent plays during training
# (one episode = start state to terminal state).
num_episodes = 10000
# Upper bound on steps per episode, so an episode in which the agent never
# reaches a terminal state is still cut off after the 100th step.
max_steps_per_episode = 100

# --- Q-value update coefficients ---
# How strongly the newly computed Q value influences the stored one.
learning_rate = 0.1
# Discount applied to future rewards (gamma).
discount_rate = 0.99

# --- Exploration schedule ---
# Upper and lower bounds between which the exploration rate is decayed
# during training, and the rate of that decay.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
# Training starts fully exploratory, i.e. at the upper bound.
exploration_rate = max_exploration_rate

<h3>Q Learning</h3>

In [9]:
rewards_all_episodes = []  # total reward collected in each training episode

# Q-learning with an epsilon-greedy behaviour policy.
# Reads `env`, `q_table` and the training parameters defined in earlier cells,
# and updates `q_table` and `exploration_rate` in place.
for episode in range(num_episodes):
    # Every episode starts from the environment's initial state.
    state = env.reset()

    done = False  # set by env.step() once the episode has ended
    rewards_current_episode = 0

    for steps in range(max_steps_per_episode):

        # Exploration vs. exploitation: draw a number in [0, 1]; if it exceeds
        # the current exploration rate the agent exploits, otherwise it explores.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            # Exploit: pick the action with the highest Q value for this state.
            action = np.argmax(q_table[state, :])
        else:
            # Explore: sample an action at random from the action space.
            action = env.action_space.sample()

        # step() returns the new state, the reward for the action, whether the
        # episode ended, and diagnostic information about the environment.
        new_state, reward, done, info = env.step(action)

        # Q-table update: blend the old estimate with the learned value
        # reward + gamma * max_a' Q(new_state, a'), weighted by the learning rate.
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        # Move on to the new state and accumulate this episode's reward.
        state = new_state
        rewards_current_episode += reward

        if done:
            # Episode is over; continue with the next episode.
            break

    # Exponentially decay the exploration rate once the episode is finished, so
    # exploitation gradually becomes the dominant way of choosing actions:
    #   e = min_e + (max_e - min_e) * exp(-decay_rate * episode)
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

# After all episodes: average reward for each consecutive run of 1000 episodes.
# Plain slicing is used instead of np.split(..., num_episodes/1000) so that the
# chunk count is never a float and an episode count that is not a multiple of
# 1000 still works (the last chunk is simply shorter).
episodes_per_chunk = 1000
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_chunk]
    for start in range(0, len(rewards_array), episodes_per_chunk)
]
count = 0
for r in rewards_per_thousand_episodes:
    count += len(r)  # number of episodes covered so far
    # mean() divides by the real chunk length and avoids the rounding noise of
    # summing 1000 individually pre-divided values.
    print(count, ":", str(r.mean()))


# Print the learned Q-table.
print("\n\n**************Q-Table***********\n\n")
print(q_table)
        

1000 : 0.04300000000000003
2000 : 0.21100000000000016
3000 : 0.4120000000000003
4000 : 0.5620000000000004
5000 : 0.6230000000000004
6000 : 0.6720000000000005
7000 : 0.6770000000000005
8000 : 0.6660000000000005
9000 : 0.6790000000000005
10000 : 0.6560000000000005


**************Q-Table***********


[[0.53490798 0.46737174 0.47546197 0.44837305]
 [0.26576148 0.39233247 0.30790063 0.46327756]
 [0.39457154 0.39242372 0.39673602 0.41027264]
 [0.2578304  0.32185239 0.2829496  0.39815079]
 [0.56447611 0.37660495 0.37483505 0.40639473]
 [0.         0.         0.         0.        ]
 [0.32329843 0.15687394 0.20649045 0.11045486]
 [0.         0.         0.         0.        ]
 [0.3962683  0.2855091  0.4293383  0.59941775]
 [0.48699391 0.6449265  0.46657338 0.2533498 ]
 [0.67709732 0.35528348 0.38278114 0.3476013 ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.48923541 0.60707223 0.73858895 0.46736506]
 [0.71823673 0.8885861  0.7177262  0.70521

In [10]:
# Watch the agent play three episodes, always taking the action with the
# highest value in the learned `q_table` (no exploration).
for episode in range(3):
    state = env.reset()
    done = False
    print("*********Episode", episode+1, "********\n\n\n")
    time.sleep(1)  # pause for one second so the banner can be read before it is cleared

    for step in range(max_steps_per_episode):
        # clear_output(wait=True) defers clearing the cell output until there is
        # new output to show.
        clear_output(wait=True)

        env.render()  # draw the current grid so the agent's position is visible
        time.sleep(0.3)

        # Greedy action for the current state according to the learned Q-table.
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            # Show the final grid before reporting the outcome.
            clear_output(wait=True)
            env.render()

            if reward == 1:
                print("*****Yo, You reached the Goal!*******")
            elif info.get("TimeLimit.truncated", False):
                # NOTE(review): relies on gym's TimeLimit wrapper flagging
                # step-limit endings in `info` - confirm for the installed gym
                # version. Without the flag this falls through to the branch below.
                print("******Time limit reached before the goal, game over!!**********")
            else:
                print("******You fell in hole, game over!!**********")
            time.sleep(3)
            clear_output(wait=True)
            break

        state = new_state
    else:
        # The step budget ran out without the environment reporting the end of
        # the episode; say so instead of ending silently.
        print("******Ran out of steps, game over!!**********")
        time.sleep(3)
        clear_output(wait=True)
            

  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
******You fell in hole, game over!!**********


In [11]:
# Closing message for the notebook; this cell performs no computation.
print("Hell Yeah!!!!! I Love Reinforcement Learning!!!!!!")

Hell Yeah!!!!! I Love Reinforcement Learning!!!!!!
