In [1]:
import gym
import pandas
import numpy as np

env = gym.make('CartPole-v0')

In [2]:
env.env?

In [3]:
num_episodes = 10000

life_memory = []

for i in range(num_episodes):
    old_observation = env.reset()
    done = False
    tot_reward = 0
    ep_memory = []
    while not done:
        new_action = env.action_space.sample()
        observation, reward, done, info = env.step(new_action)
        tot_reward += reward
        
        ep_memory.append({
            "obs0": old_observation[0],
            "obs1": old_observation[1],
            "obs2": old_observation[2],
            "obs3": old_observation[3],
            "action": new_action,
            "reward": reward,
            "episode": i,
        })
        old_observation = observation
        
    for ep_mem in ep_memory:
        ep_mem["tot_reward"] = tot_reward
        
    life_memory.extend(ep_memory)
    
memory_df = pandas.DataFrame(life_memory)


In [4]:
memory_df.groupby("episode").reward.sum().mean()
memory_df["comb_reward"] = .5*memory_df.reward + memory_df.tot_reward

In [5]:
memory_df

Unnamed: 0,obs0,obs1,obs2,obs3,action,reward,episode,tot_reward,comb_reward
0,0.019188,-0.039022,0.040432,0.025608,1,1.0,0,18.0,18.5
1,0.018408,0.155497,0.040944,-0.254049,1,1.0,0,18.0,18.5
2,0.021518,0.350011,0.035863,-0.533541,0,1.0,0,18.0,18.5
3,0.028518,0.154404,0.025193,-0.229777,0,1.0,0,18.0,18.5
4,0.031606,-0.041069,0.020597,0.070745,0,1.0,0,18.0,18.5
...,...,...,...,...,...,...,...,...,...
223290,0.063689,-0.034803,-0.132928,-0.321163,1,1.0,9999,19.0,19.5
223291,0.062993,0.161937,-0.139352,-0.652635,1,1.0,9999,19.0,19.5
223292,0.066231,0.358696,-0.152404,-0.985750,1,1.0,9999,19.0,19.5
223293,0.073405,0.555494,-0.172119,-1.322159,0,1.0,9999,19.0,19.5


In [6]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
import datetime

n = datetime.datetime.now
t = n()

RandomForest = RandomForestRegressor(n_estimators=50)
AdaBoost = AdaBoostRegressor(n_estimators=50)
ExtraTrees = ExtraTreesRegressor(n_estimators=50)

RandomForest.fit(memory_df[["obs0", "obs1", "obs2", "obs3", "action"]], memory_df.comb_reward)
AdaBoost.fit(memory_df[["obs0", "obs1", "obs2", "obs3", "action"]], memory_df.comb_reward)
ExtraTrees.fit(memory_df[["obs0", "obs1", "obs2", "obs3", "action"]], memory_df.comb_reward)
print(f"Training time: {n() - t}s")

Training time: 0:05:20.702204s


In [7]:
t = n()

num_episodes = 100
r_life_memory = []
a_life_memory = []
e_life_memory = []

r_env = gym.make('CartPole-v0')
for i in range(num_episodes):
    r_old_observation = r_env.reset()
    r_done = False
    r_tot_reward = 0
    r_em_memory = []
    while not r_done:
        pred_in = [list(r_old_observation) + [i] for i in range(2)]
        r_new_action = np.argmax(RandomForest.predict(pred_in))
        r_observation, r_reward, r_done, r_info = r_env.step(r_new_action)
        r_tot_reward += r_reward
        
        r_em_memory.append({
            "obs0": r_old_observation[0],
            "obs1": r_old_observation[1],
            "obs2": r_old_observation[2],
            "obs3": r_old_observation[3],
            "action": r_new_action,
            "reward": r_reward,
            "episode": i,
        })
        
        r_old_observation = r_observation
        
    for r_em in r_em_memory:
        r_em["tot_reward"] = r_tot_reward
        
    r_life_memory.extend(r_em_memory)

a_env = gym.make('CartPole-v0')
for i in range(num_episodes):
    a_old_observation = a_env.reset()
    a_done = False
    a_tot_reward = 0
    a_em_memory = []
    while not a_done:
        pred_in = [list(a_old_observation) + [i] for i in range(2)]
        a_new_action = np.argmax(AdaBoost.predict(pred_in))
        a_observation, a_reward, a_done, a_info = a_env.step(a_new_action)
        a_tot_reward += a_reward
        
        a_em_memory.append({
            "obs0": a_old_observation[0],
            "obs1": a_old_observation[1],
            "obs2": a_old_observation[2],
            "obs3": a_old_observation[3],
            "action": a_new_action,
            "reward": a_reward,
            "episode": i,
        })
        
        a_old_observation = a_observation

    for a_em in a_em_memory:
        a_em["tot_reward"] = a_tot_reward
        
    a_life_memory.extend(a_em_memory)

e_env = gym.make('CartPole-v0')
for i in range(num_episodes):
    e_old_observation = e_env.reset()
    e_done = False
    e_tot_reward = 0
    e_em_memory = []
    while not r_done:
        pred_in = [list(e_old_observation) + [i] for i in range(2)]
        e_new_action = np.argmax(ExtraTrees.predict(pred_in))
        e_observation, e_reward, e_done, e_info = e_env.step(e_new_action)
        e_tot_reward += e_reward
        e_em_memory.append({
            "obs0": e_old_observation[0],
            "obs1": e_old_observation[1],
            "obs2": e_old_observation[2],
            "obs3": e_old_observation[3],
            "action": e_new_action,
            "reward": e_reward,
            "episode": i,
        })
        
        e_old_observation = e_observation

    for e_em in e_em_memory:
        e_em["tot_reward"] = e_tot_reward
        
    e_life_memory.extend(e_em_memory)
    
r_memory_df = pandas.DataFrame(r_life_memory)
a_memory_df = pandas.DataFrame(a_life_memory)
e_memory_df = pandas.DataFrame(e_life_memory)

print(f"Training time: {n() - t}s")

Training time: 0:02:20.186101s


In [8]:
r_memory_df.groupby("episode").reward.sum().mean()

126.59

In [9]:
a_memory_df.groupby("episode").reward.sum().mean()

38.49

In [13]:
e_memory_df

In [10]:
e_memory_df.groupby("episode").reward.sum().mean()

KeyError: 'episode'