**Frozen Lake 8x8 V0**

https://gym.openai.com/envs/FrozenLake8x8-v0/

SFFFFFFF       (S: starting point, safe)<br>
FFFFFFFF       (F: frozen surface, safe)<br>
FFFHFFFF       (H: hole, fall to your doom)<br>
FFFFFHFF       (G: goal, where the frisbee is located)<br>
FFFHFFFF<br>
FHHFFFHF<br>
FHFFHFHF<br>
FFFHFFFG<br>

## Install the dependencies (if required)



In [None]:
!pip install gym
!pip install torch

## Import the dependencies 

In [11]:
import matplotlib.pyplot as plt
import time
import gym
import numpy as np
from tqdm import tqdm, trange
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
from datetime import datetime
import gdown

## Create the environment

In [12]:
# Build the Frozen Lake 8x8 environment and record the sizes (`.n`) of its
# observation and action spaces for use by the cells below.
environment = 'FrozenLake8x8-v0'
env = gym.make(environment)
state_space, action_space = env.observation_space.n, env.action_space.n

## Helper function

In [13]:
def one_hot_encoding(x, l):
    """Return a 1 x l tensor of zeros with a single 1 placed at column x.

    Args:
        x: Index of the column to set to 1.
        l: Length of the encoding (number of columns).

    Returns:
        A tensor of shape (1, l) created with torch.zeros, with entry [0, x] set to 1.
    """
    encoded = torch.zeros(1, l)
    encoded[0, x] = 1
    return encoded

## Load model

In [14]:
class Policy(nn.Module):
    """Two stacked bias-free linear layers mapping an encoded state to one score per action.

    There is no activation between the layers, so the network is a pure linear map.

    Args:
        state_space: Number of input features. When None (the default) it is read
            from the notebook-level ``env.observation_space.n``.
        action_space: Number of outputs. When None (the default) it is read from
            the notebook-level ``env.action_space.n``.
        hidden: Width of the intermediate layer (default 1000).
    """

    def __init__(self, state_space=None, action_space=None, hidden=1000):
        super().__init__()
        # Fall back to the global environment only when no size is given, so that
        # a plain `Policy()` keeps behaving exactly as before.
        self.state_space = env.observation_space.n if state_space is None else state_space
        self.action_space = env.action_space.n if action_space is None else action_space
        self.hidden = hidden
        # Keep the attribute names l1/l2: they determine the state_dict keys
        # (l1.weight, l2.weight) that a saved checkpoint is matched against.
        self.l1 = nn.Linear(self.state_space, self.hidden, bias=False)
        self.l2 = nn.Linear(self.hidden, self.action_space, bias=False)

    def forward(self, x):
        """Apply l1 and then l2 to x and return the result."""
        # Call the layers directly instead of building a new nn.Sequential on every pass.
        return self.l2(self.l1(x))

In [16]:
# Fetch the saved state dict from Google Drive into the current working directory.
# The call is the last expression of the cell, so its return value is displayed.
url = 'https://drive.google.com/uc?id=1IqvH4tWDnm8b0jMqnzTxDSaGVJUZRBpC'
output = 'state_dict_model.pt'
gdown.download(url=url, output=output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1IqvH4tWDnm8b0jMqnzTxDSaGVJUZRBpC
To: /content/state_dict_model.pt
100%|██████████| 273k/273k [00:00<00:00, 55.8MB/s]


'state_dict_model.pt'

### Load the downloaded weights (change the path if the file was saved elsewhere)

In [17]:
# Rebuild the network and load the downloaded weights onto the CPU.
# `output` is the file name the download cell wrote to; using it instead of a
# hard-coded '/content/...' path keeps this cell working outside Colab.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
policy = Policy()
policy.load_state_dict(torch.load(output, map_location=torch.device('cpu')))
# Switch to evaluation mode; as the last expression this also displays the module summary.
policy.eval()

Policy(
  (l1): Linear(in_features=64, out_features=1000, bias=False)
  (l2): Linear(in_features=1000, out_features=4, bias=False)
)

## Play the game once

In [18]:
# Base pause passed to time.sleep() by the game cells below (value in seconds);
# the end of an episode pauses for 3 * delay.
delay = 0.01

In [19]:
# Play one episode with the greedy policy, drawing the board before every move.
state = env.reset()
reward = 0
done = False
policy.eval()
i = 0  # number of steps taken so far

while not done:
    i += 1
    env.render()
    # Choose the action with the largest network output for the current state.
    # no_grad: this is inference only, so no autograd graph needs to be recorded.
    # (The deprecated Variable wrapper is not needed; tensors are passed directly.)
    with torch.no_grad():
        q_values = policy(one_hot_encoding(state, state_space))
    _, action = torch.max(q_values, 1)
    state, reward, done, _ = env.step(action.item())
    if done:
        # Show the final board and report how the episode ended.
        env.render()
        print('number of steps:', i)
        if reward > 0:
            print ('------------goal!!!')
        else:
            print ('------------hole...')
        time.sleep(3*delay)
    else:
        time.sleep(delay)


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
F[41mF[0mFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FF[41mF[0mFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
F[41mF[0mFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
F[41mF[0mFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FF[41mF[0mHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Left)
SFFFFFFF
FFFFFFFF
F[41mF[0mFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
F[41mF[0mFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
F[41mF[0mFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFF

## Play the game N (= `test_episodes`) times

In [20]:
# Evaluate the greedy policy over `test_episodes` episodes and count the wins.
policy.eval()
wins = 0
test_episodes = 100

for episode in range(test_episodes):
    print('episode: ', episode)
    # Each episode starts from a fresh reset, so no extra reset is needed before the loop.
    state = env.reset()
    reward = 0
    done = False
    i = 0  # number of steps taken in this episode

    while not done:
        i += 1
        # Greedy action selection; no_grad because this is inference only.
        with torch.no_grad():
            q_values = policy(one_hot_encoding(state, state_space))
        _, action = torch.max(q_values, 1)
        # No pause between steps: nothing is rendered until the episode ends.
        state, reward, done, _ = env.step(action.item())

    # Only the final board of each episode is shown.
    env.render()
    print('number of steps:', i)
    if reward > 0:
        wins += 1
        print ('------goal')
    else:
        # NOTE(review): every episode that ends without a reward is reported as a hole;
        # if the environment can also end an episode for another reason (e.g. a step
        # limit), those are counted here too — confirm.
        print ('-----hole')
    time.sleep(3*delay)

print("Goals/Holes: %d/%d" % (wins, test_episodes - wins))
env.close()

episode:  0
  (Left)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFF[41mH[0mFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
number of steps: 34
-----hole
episode:  1
  (Left)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFF[41mH[0mFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
number of steps: 21
-----hole
episode:  2
  (Up)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
F[41mH[0mFFHFHF
FFFHFFFG
number of steps: 21
-----hole
episode:  3
  (Left)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FH[41mH[0mFFFHF
FHFFHFHF
FFFHFFFG
number of steps: 27
-----hole
episode:  4
  (Up)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
F[41mH[0mFFHFHF
FFFHFFFG
number of steps: 47
-----hole
episode:  5
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFF[41mH[0mFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
number of steps: 66
-----hole
episode:  6
  (Up)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
F[41mH[0mFFHFHF
FFFHFFFG
number of steps: 27
-----hole
episode:  7
  (Left)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FH[41mH[0mFFFH