In [18]:
import numpy as np
import gym
frozenLake4x4 = 'SSSS\nSHSH\nSSSS\nHSSF'


print(frozenLake4x4)


SSSS
SHSH
SSSS
HSSF


In [54]:
import sys
from contextlib import closing

import numpy as np
from six import StringIO, b

from gym import utils
from gym.envs.toy_text import discrete

LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

# DISCOUNT = 1.0
# STEP_REWARD = 0.0
# LOSE_REWARD = 0.0
# WIN_REWARD = 1.0

DISCOUNT = .8
STEP_REWARD = 0
LOSE_REWARD = 0
WIN_REWARD = 1.0

MAPS = {
    "4x4": [
        "SFFF",
        "FHFH",
        "FFFH",
        "HFFG"
    ],
    "8x8": [
        "SFFFFFFF",
        "FFFFFFFF",
        "FFFHFFFF",
        "FFFFFHFF",
        "FFFHFFFF",
        "FHHFFFHF",
        "FHFFHFHF",
        "FFFHFFFG"
    ]
}


def generate_random_map(size=8, p=0.8):
    """Generates a random valid map (one that has a path from start to goal)
    :param size: size of each side of the grid
    :param p: probability that a tile is frozen
    """
    valid = False

    # DFS to check that it's a valid path.
    def is_valid(res):
        frontier, discovered = [], set()
        frontier.append((0,0))
        while frontier:
            r, c = frontier.pop()
            if not (r,c) in discovered:
                discovered.add((r,c))
                directions = [(1, 0), (0, 1), (-1, 0), (0, -1)]
                for x, y in directions:
                    r_new = r + x
                    c_new = c + y
                    if r_new < 0 or r_new >= size or c_new < 0 or c_new >= size:
                        continue
                    if res[r_new][c_new] == 'G':
                        return True
                    if (res[r_new][c_new] not in '#H'):
                        frontier.append((r_new, c_new))
        return False

    while not valid:
        p = min(1, p)
        res = np.random.choice(['F', 'H'], (size, size), p=[p, 1-p])
        res[0][0] = 'S'
        res[-1][-1] = 'G'
        valid = is_valid(res)
    return ["".join(x) for x in res]


class FrozenLakeEnv(discrete.DiscreteEnv):
    """
    Winter is here. You and your friends were tossing around a frisbee at the park
    when you made a wild throw that left the frisbee out in the middle of the lake.
    The water is mostly frozen, but there are a few holes where the ice has melted.
    If you step into one of those holes, you'll fall into the freezing water.
    At this time, there's an international frisbee shortage, so it's absolutely imperative that
    you navigate across the lake and retrieve the disc.
    However, the ice is slippery, so you won't always move in the direction you intend.
    The surface is described using a grid like the following

        SFFF
        FHFH
        FFFH
        HFFG

    S : starting point, safe
    F : frozen surface, safe
    H : hole, fall to your doom
    G : goal, where the frisbee is located

    The episode ends when you reach the goal or fall in a hole.
    You receive a reward of 1 if you reach the goal, and zero otherwise.

    """

    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self, desc=None, map_name="4x4",is_slippery=True):
        if desc is None and map_name is None:
            desc = generate_random_map()
        elif desc is None:
            desc = MAPS[map_name]
        self.desc = desc = np.asarray(desc,dtype='c')
        self.nrow, self.ncol = nrow, ncol = desc.shape
        self.reward_range = (0, 1)

        nA = 4
        nS = nrow * ncol

        isd = np.array(desc == b'S').astype('float64').ravel()
        isd /= isd.sum()

        P = {s : {a : [] for a in range(nA)} for s in range(nS)}

        def to_s(row, col):
            return row*ncol + col

        def inc(row, col, a):
            if a == LEFT:
                col = max(col-1,0)
            elif a == DOWN:
                row = min(row+1,nrow-1)
            elif a == RIGHT:
                col = min(col+1,ncol-1)
            elif a == UP:
                row = max(row-1,0)
            return (row, col)

        for row in range(nrow):
            for col in range(ncol):
                s = to_s(row, col)
                for a in range(4):
                    li = P[s][a]
                    letter = desc[row, col]
                    if letter in b'GH':
                        li.append((1.0, s, 0, True))
                    else:
                        if is_slippery:
                            for b in [(a-1)%4, a, (a+1)%4]:
                                newrow, newcol = inc(row, col, b)
                                newstate = to_s(newrow, newcol)
                                newletter = desc[newrow, newcol]
                                done = bytes(newletter) in b'GH'
                                rew = float(newletter == b'G')
                                li.append((1.0/3.0, newstate, rew, done))
                        else:
                            newrow, newcol = inc(row, col, a)
                            newstate = to_s(newrow, newcol)
                            newletter = desc[newrow, newcol]
                            done = bytes(newletter) in b'GH'
                            rew = float(newletter == b'G')
                            li.append((1.0, newstate, rew, done))

        super(FrozenLakeEnv, self).__init__(nS, nA, P, isd)

    def render(self, mode='human'):
        outfile = StringIO() if mode == 'ansi' else sys.stdout

        row, col = self.s // self.ncol, self.s % self.ncol
        desc = self.desc.tolist()
        desc = [[c.decode('utf-8') for c in line] for line in desc]
        desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True)
        if self.lastaction is not None:
            outfile.write("  ({})\n".format(["Left","Down","Right","Up"][self.lastaction]))
        else:
            outfile.write("\n")
        outfile.write("\n".join(''.join(line) for line in desc)+"\n")

        if mode != 'human':
            with closing(outfile):
                return outfile.getvalue()
            
def value_iteration(env, max_iterations=100000, lmbda=0.9):
  stateValue = [0 for i in range(env.nS)]
  newStateValue = stateValue.copy()
  for i in range(max_iterations):
    for state in range(env.nS):
      action_values = []      
      for action in range(env.nA):
        state_value = 0
        for i in range(len(env.P[state][action])):
          prob, next_state, reward, done = env.P[state][action][i]
          state_action_value = prob * (reward + lmbda*stateValue[next_state])
          state_value += state_action_value
        action_values.append(state_value)      #the value of each action
        best_action = np.argmax(np.asarray(action_values))   # choose the action which gives the maximum value
        newStateValue[state] = action_values[best_action]  #update the value of the state
    if i > 1000: 
      if sum(stateValue) - sum(newStateValue) < 1e-04:   # if there is negligible difference break the loop
        break
#         print(i)
    else:
      stateValue = newStateValue.copy()
  return stateValue
   
def get_policy(env,stateValue, lmbda=0.9):
  policy = [0 for i in range(env.nS)]
  for state in range(env.nS):
    action_values = []
    for action in range(env.nA):
      action_value = 0
      for i in range(len(env.P[state][action])):
        prob, next_state, r, _ = env.P[state][action][i]
        action_value += prob * (r + lmbda * stateValue[next_state])
      action_values.append(action_value)
    best_action = np.argmax(np.asarray(action_values))
    policy[state] = best_action
  return policy 

def get_score(env, policy, episodes=1000):
  misses = 0
  steps_list = []
  for episode in range(episodes):
    observation = env.reset()
    steps=0
    while True:
      
      action = policy[observation]
      observation, reward, done, _ = env.step(action)
      steps+=1
      if done and reward == 1:
        # print('You have got the fucking Frisbee after {} steps'.format(steps))
        steps_list.append(steps)
        break
      elif done and reward == 0:
        # print("You fell in a hole!")
        misses += 1
        break
  print('----------------------------------------------')
  print('You took an average of {:.0f} steps to get the frisbee'.format(np.mean(steps_list)))
  print('And you fell in the hole {:.2f} % of the times'.format((misses/episodes) * 100))
  print('----------------------------------------------')

def random_policy(env):
    return np.random.randint(0, 4, size=env.nS)

def one_step_lookahead(env, s, value_function):
    action_values = np.zeros(env.nA)
    for a in range(env.nA):
        value = avg_reward(env, s, a)
        for p, next_s, _, _ in env.P[s][a]:
            value += DISCOUNT * p * value_function[next_s]
        action_values[a] = value
    return action_values
    
def evaluate_policy(env, policy, max_backups=1000, tol=1e-6):
    old_value = np.zeros(env.nS)
    for i in range(max_backups):
        new_value = np.zeros(env.nS)
        for s in range(env.nS):
            action_values = one_step_lookahead(env, s, old_value)
            new_value[s] = action_values[policy[s]]
        if np.max(np.abs(new_value-old_value)) < tol:
            break
        old_value = new_value
    return new_value

def greedy_policy(env, value_function):
    policy = np.zeros(env.nS, dtype=np.int32)
    for s in range(env.nS):
        action_values = one_step_lookahead(env, s, value_function)
        policy[s] = np.argmax(action_values)
    return policy

def policy_iteration(env, max_steps=100):
    old_policy = random_policy(env)
    for i in range(max_steps):
        value_function = evaluate_policy(env, old_policy)
        new_policy = greedy_policy(env, value_function)
        
        if np.array_equal(new_policy, old_policy):
            break
        old_policy = new_policy
    return old_policy, value_function

def avg_reward(env, s, a):
    avg_reward = 0
    for prob, next_s, reward, done in env.P[s][a]:
        if not done:
            avg_reward += prob * STEP_REWARD
        elif reward == 0.0:
#             avg_reward += prob * (-5)
            avg_reward += prob * LOSE_REWARD
        else:
#             avg_reward += prob * 10
            avg_reward += prob * WIN_REWARD
    return avg_reward

In [60]:
env = gym.make('FrozenLake-v0')
print(env.R)
# env = FrozenLakeEnv(map_name='8x8')
# print(env.P[0])
# values = value_iteration(frozen)
# p1 = get_policy(frozen, values)
# p2 = policy_iteration(frozen, max_steps=50)
# print(np.array(p1))
# print(p2[0])


# get_score(frozen, p2[0])



AttributeError: 'FrozenLakeEnv' object has no attribute 'R'