In [1]:
import gym
import random
import numpy as np
import time
from collections import deque
import pickle


from collections import defaultdict


# Hyperparameters for the tabular Q-learning run below.
EPISODES = 20000         # number of training episodes
LEARNING_RATE = 0.1      # weight given to the new target in each Q-value update
DISCOUNT_FACTOR = 0.99   # multiplier applied to the best next-state value
EPSILON = 1              # initial probability of choosing a random action
EPSILON_DECAY = 0.999    # EPSILON is multiplied by this after every episode


In [2]:
def default_Q_value():
    """Return the initial value (zero) used for Q-table entries that do not exist yet.

    Intended as the factory passed to ``collections.defaultdict``.
    """
    initial_estimate = 0
    return initial_estimate

def epsilon_greedy_search(Epsilon, qtable, state):
    """Pick an action for ``state`` using an epsilon-greedy policy.

    Args:
        Epsilon: probability threshold; a uniform draw from ``random.random()``
            below this value triggers a random action.
        qtable: mapping keyed by ``(state, action)`` tuples holding Q-values.
        state: the state to act in.

    Returns:
        A random sample from the global ``env``'s action space when exploring,
        otherwise the result of ``best_move_for_a_state`` for ``state``.
    """
    exploring = random.random() < Epsilon
    if not exploring:
        # Exploit: defer to the greedy lookup over the current Q-table.
        return best_move_for_a_state(Q_table=qtable, state=state)
    return env.action_space.sample()

def best_move_for_a_state(Q_table, state):
    """Return the action with the highest recorded Q-value for ``state``.

    Only ``(state, action)`` keys already present in ``Q_table`` are considered;
    actions that have no entry yet are not part of the comparison. When several
    entries share the highest value, the one encountered first while iterating
    over ``Q_table`` is chosen.

    Args:
        Q_table: mapping keyed by ``(state, action)`` tuples holding Q-values.
        state: the state whose best known action is wanted.

    Returns:
        The action component of the best matching key, or a random sample from
        the global ``env``'s action space when ``state`` has no entries at all.
    """
    matching_keys = [key for key in Q_table if key[0] == state]
    if not matching_keys:
        # Nothing is known about this state yet, so fall back to a random action.
        return env.action_space.sample()

    scores = [Q_table[key] for key in matching_keys]
    _, best_action = matching_keys[np.argmax(scores)]
    return best_action

def max_a_prime(Q_table, state):
    """Return the largest recorded Q-value among the entries for ``state``.

    Only ``(state, action)`` keys already present in ``Q_table`` are considered.

    Args:
        Q_table: mapping keyed by ``(state, action)`` tuples holding Q-values.
        state: the state whose best known value is wanted.

    Returns:
        The maximum stored value for ``state``, or ``0`` when ``state`` has no
        entries in ``Q_table``.
    """
    known_values = [Q_table[key] for key in Q_table if key[0] == state]
    if not known_values:
        return 0
    return known_values[np.argmax(known_values)]

In [3]:
# adapted from CS 540 HW 10

# Seed every source of randomness used in this cell. Seeding `random` and
# `np.random` alone is not sufficient: gym keeps its own generators for the
# environment and for the action space, so `env.action_space.sample()` and the
# environment's transitions would otherwise differ from run to run.
random.seed(1)
np.random.seed(1)
env = gym.envs.make("FrozenLake-v1")
env.seed(1)  # pre-0.26 gym seeding API, consistent with the reset()/step() usage below
env.action_space.seed(1)


# Q_table maps (state, action) -> estimated value; missing pairs default to 0
# through default_Q_value.
Q_table = defaultdict(default_Q_value)

# Rolling window with the total reward of the most recent 100 episodes.
episode_reward_record = deque(maxlen=100)

for i in range(EPISODES):
    episode_reward = 0
    done = False
    obs = env.reset()

    while not done:
        # a = epsilon-greedy(Q, s); then observe reward r and next state s'.
        action = epsilon_greedy_search(Epsilon=EPSILON, qtable=Q_table, state=obs)

        oldObs = obs
        obs, reward, done, info = env.step(action)

        # Q(s, a) = (1 - LEARNING_RATE) * Q(s, a)
        #           + LEARNING_RATE * (r + DISCOUNT_FACTOR * max_a' Q(s', a'))
        # The current value is read before max_a_prime runs so that the
        # defaultdict entry for (s, a) exists by the time the table is scanned.
        old_value = Q_table[(oldObs, action)]
        td_target = reward + DISCOUNT_FACTOR * max_a_prime(Q_table, obs)
        Q_table[(oldObs, action)] = (1 - LEARNING_RATE) * old_value + LEARNING_RATE * td_target

        episode_reward += reward  # accumulate this episode's total reward

    # Decay the exploration rate once per episode. EPSILON is deliberately
    # updated at module level so the decayed value stays available afterwards.
    EPSILON *= EPSILON_DECAY

    # Record the reward for this episode.
    episode_reward_record.append(episode_reward)

    if i % 100 == 0 and i > 0:
        # Divide by the actual window length instead of a hard-coded 100.
        print("LAST 100 EPISODE AVERAGE REWARD: " + str(sum(episode_reward_record) / len(episode_reward_record)))
        print("EPSILON: " + str(EPSILON))



LAST 100 EPISODE AVERAGE REWARD: 0.0
EPSILON: 0.9038873549665959
LAST 100 EPISODE AVERAGE REWARD: 0.0
EPSILON: 0.8178301806491574
LAST 100 EPISODE AVERAGE REWARD: 0.0
EPSILON: 0.7399663251239436
LAST 100 EPISODE AVERAGE REWARD: 0.0
EPSILON: 0.6695157201007336
LAST 100 EPISODE AVERAGE REWARD: 0.0
EPSILON: 0.6057725659163237
LAST 100 EPISODE AVERAGE REWARD: 0.02
EPSILON: 0.548098260578011
LAST 100 EPISODE AVERAGE REWARD: 0.02
EPSILON: 0.4959150020176678
LAST 100 EPISODE AVERAGE REWARD: 0.05
EPSILON: 0.44869999946146477
LAST 100 EPISODE AVERAGE REWARD: 0.02
EPSILON: 0.4059802359226587
LAST 100 EPISODE AVERAGE REWARD: 0.08
EPSILON: 0.36732772934619257
LAST 100 EPISODE AVERAGE REWARD: 0.08
EPSILON: 0.33235524492954527
LAST 100 EPISODE AVERAGE REWARD: 0.1
EPSILON: 0.3007124156643058
LAST 100 EPISODE AVERAGE REWARD: 0.17
EPSILON: 0.2720822322326576
LAST 100 EPISODE AVERAGE REWARD: 0.13
EPSILON: 0.2461778670932771
LAST 100 EPISODE AVERAGE REWARD: 0.21
EPSILON: 0.22273980093919937
LAST 100 EPIS