In [10]:
import gymnasium as gym
import pandas
import numpy as np

In [None]:
# Build an environment with gym.make()
env = gym.make('FrozenLake-v1', render_mode="human")  # Build a fresh environment

# Start a new game with env.reset().
# reset() starts a new "episode" and returns a pair: (initial observation, info),
# so unpack it instead of storing the whole tuple as the observation.
current_observation, info = env.reset()

# The current observation is just the current location
print(current_observation)  # Observations are just a number

In [None]:
# There are four discrete actions available in this environment
print(f'Our action space: {env.action_space}')

# Randomly sample an action and print the result
new_action = env.action_space.sample()
print(f'Our new action: {new_action}')

In [None]:
# Draw a random action from the environment's action space
new_action = env.action_space.sample()

# Take one step with that action; env.step() returns five values, unpacked here as
# the new observation, the reward for this step, the terminated and truncated flags,
# and an info value
observation, reward, term, trunc, info = env.step(new_action)

# Print everything the step returned
print(f"Action: {new_action}, observation: {observation}, reward: {reward}, terminated: {term}, truncated: {trunc}, info: {info}")

In [None]:
# Play 100 episodes with a purely random agent and print every transition.
for episode in range(100):
    print(f'Episode {episode}')

    # Start every episode from a fresh state; reset() returns (observation, info)
    current_observation, info = env.reset()

    done = False
    while not done:
        # Unpack and examine the result from taking a step in the environment
        new_action = env.action_space.sample()
        observation, reward, term, trunc, info = env.step(new_action)
        print(f"Action: {new_action}, observation: {observation}, reward: {reward}, terminated: {term}, truncated: {trunc}, info: {info}")

        # An episode is over when it is terminated OR truncated; checking only
        # `term` would keep stepping an environment that has already been truncated.
        done = term or trunc

    print("\n")

# Part 1 - Gather Data
In the last section, we have explored how to simulate episodes of the game from the start to a termination point. In this section, we would like to gather more of these episodes to start training an intelligent actor.

We will let the system simulate a large number of episodes of the game (`num_episodes` in the code below, currently 40000) and collect the data using pandas. Each row of the data frame is one step, and the features of each row are:

- **`observation`** (column `prevObs`) - the observation at the beginning of the step (before taking the action)
- **`action`** (column `act`) - the randomly sampled action
- **`DCF_reward`** - the episode's final reward distributed over the steps that led to it. If the episode reaches the goal, each step receives a share of the final reward that grows linearly with the step's position in the episode; otherwise it is 0.
- **`reward`** - the reward received at that step. After all episodes have been simulated, the sum of this column is the number of successful episodes, which is used to calculate the success rate of reaching the goal

Also consider a feature **`success`** that has a value of 1 at every step of an episode that ends with the final reward, and 0 otherwise.

In [1]:
import numpy as np
import pandas as pd
import gymnasium as gym

In [2]:
env = gym.make("FrozenLake-v1", render_mode="rgb_array")

# Flat list of per-step records (the training data), filled one episode at a time
data = []

num_episodes = 40000
for episode in range(num_episodes):
    prev_obs = env.reset()[0]   # reset() returns (observation, info); keep the observation
    episode_return = 0          # Sum of the step rewards collected in this episode
    episode_rows = []           # Records of this episode only

    finished = False
    while not finished:
        # Act randomly: sample an action from the action space and apply it
        action = env.action_space.sample()
        next_obs, step_reward, term, trunc, info = env.step(action)

        episode_return += step_reward
        finished = term or trunc

        # One record per step; 'step' is the 1-based index of the step in the episode
        episode_rows.append({
            'step': len(episode_rows) + 1,
            'prevObs': prev_obs,
            'act': action,
            'currObs': next_obs,
            'reward': step_reward,
            'done': finished
        })

        # The observation we arrived at is where the next step starts
        prev_obs = next_obs

    # Annotate every record once the outcome of the whole episode is known
    episode_length = len(episode_rows)
    for position, row in enumerate(episode_rows):
        # Episode return copied onto each step: marks the steps that led to a reward
        row['success'] = episode_return
        # Linear ramp of the episode return: 0 for the first step, growing with the
        # step's position up to (n - 1) / n of the return for the last step
        row["DCF_reward"] = position / episode_length * episode_return

    data.extend(episode_rows)

In [None]:
# Collect the per-step records into a single DataFrame (one row per step)
data_df = pd.DataFrame(data)
# NOTE(review): this lifts the row limit for every DataFrame displayed afterwards;
# avoid displaying the full frame directly while it is in effect.
pd.set_option('display.max_rows', None)

# The last step of every episode is the only one recorded with done == True, so
# counting those rows gives the number of episodes directly from the data instead
# of relying on a loop variable leaked from the simulation cell.
episodes_played = int(data_df['done'].sum())

# Share of episodes that collected the reward, in percent
reward_per_eps = data_df['reward'].sum() / episodes_played * 100
print(f"\n Success chance: {reward_per_eps}%")

# Part 2 - Predict

In this section, we hope to leverage the training data collected in the previous section to improve the performance of our agent. We will adopt a supervised learning approach, where the input of the model is a combination of observation and action, and the model output is the expected reward of each of the possible actions given a certain observation.

In [None]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR

# Model inputs: the observation before the step together with the action taken
x = data_df[["prevObs", "act"]]

# Training target: a weighted sum of the three reward columns
#   - "reward"     (weight 0.5): the reward received at the step itself
#   - "DCF_reward" (weight 0.1): the ramped share of the episode's reward, which is
#                                larger for steps later in a rewarded episode
#   - "success"    (weight 1):   the episode's total reward, copied onto every step
y = 0.5 * data_df["reward"] + 0.1 * data_df["DCF_reward"] + data_df["success"]

# Regressor that estimates the target from an (observation, action) pair.
# SVR() is an alternative that was considered here.
model = ExtraTreesRegressor(n_estimators=50)

model.fit(x, y)

# Part 3 - Act
Now we have a model that predicts the desired behaviour, trained to estimate a reward value when given an observation and an action. We can modify the code used in the data gathering step so that we can replace random actions with more educated ones.

With a random agent, we achieved a success rate of 1-2%. With more informed actions, the success rate should increase by 10x. 

How we can improve the model:
- We can further tune the model for better performance. 
- Try different representations of the observation and actions spaces. 
- Try different models 
- Try different reward structures

In [None]:
# Inputs: (observation before the step, action taken)
x = data_df[["prevObs", "act"]]

# Target: the same weighted mix of step reward, ramped reward and episode success
y = 0.5 * data_df["reward"] + 0.1 * data_df["DCF_reward"] + data_df["success"]

# Refit the predictor, this time as a random forest with default settings
model = RandomForestRegressor()
model.fit(x, y)

In [None]:
# Collect the per-step records into a single DataFrame (one row per step)
data_df = pd.DataFrame(data)
# NOTE(review): this lifts the row limit for every DataFrame displayed afterwards;
# avoid displaying the full frame directly while it is in effect.
pd.set_option('display.max_rows', None)

# Summary statistics of the gathered data. The frame is called `data_df` (there is
# no `df` in this notebook), and it is printed because a bare expression in the
# middle of a cell is not displayed.
print(data_df.describe())

# The last step of every episode is the only one recorded with done == True, so
# counting those rows gives the number of episodes directly from the data instead
# of relying on a loop variable leaked from the simulation cell.
episodes_played = int(data_df['done'].sum())

# Share of episodes that collected the reward, in percent
reward_per_eps = data_df['reward'].sum() / episodes_played * 100
print(f"\n Success chance: {reward_per_eps}%")

In [None]:
# Play a few episodes where every action is chosen by the trained model instead of
# being sampled at random, recording each step in the same format as the training data.
num_episodes = 10

data = []

for episode in range(num_episodes):

    initial_observation = env.reset()[0]  # reset() returns (observation, info)
    done = False
    cumsum_reward = 0  # Accumulated reward for each episode
    step = 0
    ep_data = []

    print(f"Episode: {episode+1}")

    while not done:
        # Make an informed choice from the trained model: build one
        # ['prevObs', 'act'] row per action (0..3) for the current observation,
        # predict a score for each, and take the action with the highest score.
        pred_in = pd.DataFrame(
            [[initial_observation, i] for i in range(4)],
            columns=['prevObs', 'act'],
        )
        new_action = np.argmax(model.predict(pred_in))

        # Take the step and record the result of the step
        new_observation, current_reward, term, trunc, info = env.step(new_action)

        cumsum_reward += current_reward
        step += 1
        done = term or trunc

        # Append a dictionary of key-value pairs to the list
        ep_data.append({
            'step': step,
            'prevObs': initial_observation,
            'act': new_action,
            'currObs': new_observation,
            'reward': current_reward,
            'done': done
        })

        # The observation we arrived at is where the next step starts
        initial_observation = new_observation

    # Annotate the steps once the outcome of the whole episode is known.
    # The loop variable is named `record` so it does not shadow the `step` counter.
    num_steps = len(ep_data)
    for idx, record in enumerate(ep_data):
        record['success'] = cumsum_reward                        # To specify the steps that led to a reward
        record["DCF_reward"] = idx / num_steps * cumsum_reward   # Linear ramp of the episode reward over the steps

    data += ep_data

# Build the frame from the records gathered in THIS cell (`data`), not from the
# previous `data_df`, which still holds the random-agent data.
data_df = pd.DataFrame(data)
data_df