In [1]:
import gym
import pandas
import numpy as np

import warnings
warnings.filterwarnings("ignore")

def warn(*args, **kwargs):
    return None

warnings.warn = warn

In [2]:
env = gym.make('FrozenLake-v1')
current_observation = env.reset()
print(current_observation)

0


In [3]:
env.render()

In [4]:
print(f"our action space: {env.action_space}")
new_action = env.action_space.sample()
print(f"our new action: {new_action}")

our action space: Discrete(4)
our new action: 3


In [5]:
new_action = env.action_space.sample()

In [6]:
observation, reward, done, info = env.step(new_action)
print(f"Observation: {observation}, Reward: {reward}, Done: {done}, Info: {info}")
env.render()

Observation: 0, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}


In [7]:
current_observation = env.reset()
for i in range(5):
  new_action = env.action_space.sample()
  observation, reward, done, info = env.step(new_action)
  print(f"Observation: {observation}, Reward: {reward}, Done: {done}, Info: {info}")
  env.render()

Observation: 0, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}
Observation: 4, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}
Observation: 4, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}
Observation: 5, Reward: 0.0, Done: True, Info: {'prob': 0.3333333333333333, 'TimeLimit.truncated': False}
Observation: 5, Reward: 0, Done: True, Info: {'prob': 1.0, 'TimeLimit.truncated': False}


Now we can guess what each of the outputs mean.

**Observation** refers to the number of the tile. The tiles appear to be numbered

    0 1 2 3
    4 5 ...
    
**Reward** refers to the outcome of the game. We get 1 if we win, zero otherwise.

**Done** tells us if the game is still going. It goes to true when we win or fall into a hole.

**info** gives extra info about the world. Here, it's probabilities. Can you guess what this means here? Perhaps the world is a bit noisy.


In [8]:
current_observation = env.reset()
done = False
while not done:
  new_action = env.action_space.sample()
  observation, reward, done, info = env.step(new_action)
  print(f"Observation: {observation}, Reward: {reward}, Done: {done}, Info: {info}")


Observation: 0, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}
Observation: 1, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}
Observation: 0, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}
Observation: 4, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}
Observation: 5, Reward: 0.0, Done: True, Info: {'prob': 0.3333333333333333, 'TimeLimit.truncated': False}


# Part 1: Gather data

We want to build an intelligent actor but first we have to gather data on which actions are useful.

Use the code above as reference. Run a *random* agent through 1,000 or more episodes and collect data on each step.

I recommend you store this data in a pandas dataframe. Each row should be a step. Your features should include the following features or similar

- `observation` the observation at the beginning of the step (before acting!)
- `action` the action randomly sampled
- `current_reward` the reward received after the action was performed

After you generate this data, it is recommended that you compute a column (such as `total_reward`, that is the total reward for the entire episode).

At the end of the data gathering, you should be able to use pandas (or similar) to calculate the average total reward *per episode* of the random agent. The average score should be 1-2%, meaning that the agent very rarely wins.


## Hints

- `initial_observation = env.reset()` starts a new episode and returns the initial observation.
- `new_observation, reward, done, info = env.step(new_action)` executes one action and returns the following observation. You may look at the documentation for the step method if you are curious about what it does.
- `done != True` until the game is finished.
- we are trying to maximize the reward *per episode*. Our first game gives 0 reward unless the agent travels to the goal.
- `env.action_space.n` gives the number of possible actions in the environment. `env.action_space.sample()` allows the agent to randomly sample an action.
- `env.observation_space.n` gives the number of possible states in the environment.


In [9]:
env = gym.make('FrozenLake-v1')

num_episodes = 40000

life_memory = []
for i in range(num_episodes):

    # start a new episode and record all the memories
    old_observation = env.reset()
    done = False
    tot_reward = 0
    ep_memory = []
    while not done:
        new_action = env.action_space.sample()
        observation, reward, done, info = env.step(new_action)
        tot_reward += reward

        ep_memory.append({
            "observation": old_observation,
            "action": new_action,
            "reward": reward,
            "episode": i,
        })
        old_observation = observation

    # incorporate total reward
    num_steps = len(ep_memory)
    for i, ep_mem in enumerate(ep_memory):
        ep_mem["tot_reward"] = tot_reward
        ep_mem["decay_reward"] = i*tot_reward/num_steps

    life_memory.extend(ep_memory)

memory_df = pandas.DataFrame(life_memory)

In [10]:
memory_df.describe()

Unnamed: 0,observation,action,reward,episode,tot_reward,decay_reward
count,306719.0,306719.0,306719.0,306719.0,306719.0,306719.0
mean,2.236301,1.501345,0.001777,20070.950815,0.023536,0.01088
std,3.018256,1.118838,0.042116,11524.758304,0.151599,0.082858
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,10123.0,0.0,0.0
50%,1.0,2.0,0.0,20057.0,0.0,0.0
75%,4.0,3.0,0.0,30082.0,0.0,0.0
max,14.0,3.0,1.0,39999.0,1.0,0.97561


In [11]:
memory_df.shape

(306719, 6)

In [12]:
memory_df.groupby('episode').reward.sum().mean()

0.013625

# Step 2: Predict

Now that you have a bunch of data put it into a format that you can model. The goal here is to guide the behavior of our agent. Our agent will be given an observation and need to decide between the possible actions given that observation and the prediction of the model.

Remember, you're a data scientist! Be creative.

It might be helpful to work backwards. Ultimately, you will write something like:

```
def convert_to_row(obs, act):
    # expertly written code
    return row_of_obs_act
    
rows = [convert_to_row(current_obs, act) for act in possible_actions]

pred_outcome = model.predict(rows)
```

So, you will need to design a quantity that you can ask your model to predict for every possible action-observation pair. Think a bit about what this quantity should be. Should the model try to predict the immediate reward for each action? If so, how would it know where to go at the beginning of each episode when all moves give zero reward but when some moves bring it closer to the goal than others.


In [13]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR

In [14]:
model = ExtraTreesRegressor(n_estimators=50)
y = 0.5*memory_df.reward + 0.1*memory_df.decay_reward + memory_df.tot_reward
x = memory_df[["observation", "action"]]
model.fit(x, y)

In [15]:
# model_svr = SVR()
# model_svr.fit(x, y)

# Step 3: Act

Now that you have a model that predicts the desired behavior, let's act on it! Modify the code you used to gather data so that you replace the random decision with an intelligent one.

We started out winning ~1.5% of the games with the random agent. How well can you do? You should be able to get your model to do at least 10x better (so 15%). Can you get ~50%?

If you're having trouble, tune your model. Try different representations of the observation and action spaces. Try different models.


In [16]:
model = RandomForestRegressor()
y = 1*memory_df.reward + memory_df.tot_reward + .1*memory_df.decay_reward
x = memory_df[["observation", "action"]]
model.fit(x, y)

In [17]:
num_episodes = 500
random_per = 0

life_memory = []
for i in range(num_episodes):
    # Start a new episode and record all the memories.
    old_observation = env.reset()
    done = False
    tot_reward = 0
    ep_memory = []
    while not done:
        if np.random.rand() < random_per:
            new_action = env.action_space.sample()
        else:
            pred_in = [[old_observation,i] for i in range(4)]
            new_action = np.argmax(model.predict(pred_in))
        observation, reward, done, info = env.step(new_action)
        tot_reward += reward

        ep_memory.append({
            "observation": old_observation,
            "action": new_action,
            "reward": reward,
            "episode": i,
        })
        old_observation = observation

    # incorporate total reward
    for ep_mem in ep_memory:
        ep_mem["tot_reward"] = tot_reward

    life_memory.extend(ep_memory)

memory_df2 = pandas.DataFrame(life_memory)

# rf.fit(memory_df[["observation", "action"]], memory_df["comb_reward"])

# Score
# Much better!
memory_df2.groupby("episode").reward.sum().mean()

0.406

In [18]:
y = .1*memory_df.reward + 1*memory_df.decay_reward + 1*memory_df.tot_reward

# Extension: Pole cart

If time permits, try your hand at pole cart (`env = gym.make('CartPole-v0')`).

Notice that the observation space is quite different. It's no longer discrete--instead we have 4 continuous values. You'll have to store these differently from how you did with Frozenlake.

My random actor actually does surprisingly well (avg ~22). But my intelligent agent is able to score ~99. Can you beat me?


# Pole cart


In [19]:
env = gym.make('CartPole-v1')

In [20]:
# now we can build a toy world!
num_episodes = 1000

life_memory = []
for i in range(num_episodes):

    # start a new episode and record all the memories
    old_observation = env.reset()
    done = False
    tot_reward = 0
    ep_memory = []
    while not done:
        new_action = env.action_space.sample()
        observation, reward, done, info = env.step(new_action)
        tot_reward += reward

        ep_memory.append({
            "obs0": old_observation[0],
            "obs1": old_observation[1],
            "obs2": old_observation[2],
            "obs3": old_observation[3],
            "action": new_action,
            "reward": reward,
            "episode": i,
        })
        old_observation = observation

    # incorporate total reward
    for ep_mem in ep_memory:
        ep_mem["tot_reward"] = tot_reward

    life_memory.extend(ep_memory)

memory_df = pandas.DataFrame(life_memory)

memory_df.groupby("episode").reward.sum().mean()

22.257

In [21]:
memory_df.describe()

Unnamed: 0,obs0,obs1,obs2,obs3,action,reward,episode,tot_reward
count,22257.0,22257.0,22257.0,22257.0,22257.0,22257.0,22257.0,22257.0
mean,0.000323,-0.011769,0.001191,0.020783,0.494811,1.0,503.340387,29.080784
std,0.092939,0.537241,0.091389,0.787966,0.499984,0.0,288.98919,16.804676
min,-0.972825,-2.32756,-0.209421,-2.87683,0.0,1.0,0.0,8.0
25%,-0.041547,-0.366714,-0.051521,-0.492022,0.0,1.0,257.0,17.0
50%,0.000437,-0.004281,0.001792,0.007176,0.0,1.0,503.0,25.0
75%,0.041657,0.34951,0.053877,0.541521,1.0,1.0,748.0,36.0
max,0.982609,2.42426,0.209385,3.050226,1.0,1.0,999.0,104.0


In [22]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor

model = ExtraTreesRegressor(n_estimators=50)

memory_df["comb_reward"] = .5*memory_df.reward + memory_df.tot_reward
model.fit(memory_df[["obs0", "obs1", "obs2", "obs3", "action"]], memory_df.comb_reward)

In [23]:
num_episodes = 100
random_per = 0

life_memory = []
for i in range(num_episodes):

    # start a new episode and record all the memories
    old_observation = env.reset()
    done = False
    tot_reward = 0
    ep_memory = []
    while not done:


        if np.random.rand() < random_per:
            new_action = env.action_space.sample()
        else:
            pred_in = [list(old_observation)+[i] for i in range(2)]
            new_action = np.argmax(model.predict(pred_in))
        observation, reward, done, info = env.step(new_action)
        tot_reward += reward

        ep_memory.append({
            "obs0": old_observation[0],
            "obs1": old_observation[1],
            "obs2": old_observation[2],
            "obs3": old_observation[3],
            "action": new_action,
            "reward": reward,
            "episode": i,
        })
        old_observation = observation

    # incorporate total reward
    for ep_mem in ep_memory:
        ep_mem["tot_reward"] = tot_reward

    life_memory.extend(ep_memory)

memory_df2 = pandas.DataFrame(life_memory)
memory_df2["comb_reward"] = memory_df2.reward + memory_df2.tot_reward

# score
# much better!
memory_df2.groupby("episode").reward.sum().mean()

127.94