In [1]:
import gym
import pandas
import numpy as np

env = gym.make('CartPole-v0')

In [2]:
env.env?

In [127]:
num_episodes = 20000

life_memory = []

for i in range(num_episodes):
    old_observation = env.reset()
    done = False
    tot_reward = 0
    ep_memory = []
    while not done:
        new_action = env.action_space.sample()
        observation, reward, done, info = env.step(new_action)
        tot_reward += reward
        
        ep_memory.append({
            "obs0": old_observation[0],
            "obs1": old_observation[1],
            "obs2": old_observation[2],
            "obs3": old_observation[3],
            "action": new_action,
            "reward": reward,
            "episode": i,
        })
        old_observation = observation
        
    for ep_mem in ep_memory:
        ep_mem["tot_reward"] = tot_reward
        
    life_memory.extend(ep_memory)
    
memory_df = pandas.DataFrame(life_memory)


In [128]:
memory_df.groupby("episode").reward.sum().mean()
memory_df["comb_reward"] = .5*memory_df.reward + memory_df.tot_reward

In [129]:
memory_df

Unnamed: 0,obs0,obs1,obs2,obs3,action,reward,episode,tot_reward,comb_reward
0,0.020155,0.022310,-0.033537,-0.047698,1,1.0,0,37.0,37.5
1,0.020601,0.217897,-0.034491,-0.350770,0,1.0,0,37.0,37.5
2,0.024959,0.023282,-0.041506,-0.069160,1,1.0,0,37.0,37.5
3,0.025424,0.218973,-0.042890,-0.374644,0,1.0,0,37.0,37.5
4,0.029804,0.024486,-0.050383,-0.095787,1,1.0,0,37.0,37.5
...,...,...,...,...,...,...,...,...,...
442627,-0.001106,-0.551411,0.078975,0.917054,0,1.0,19999,10.0,10.5
442628,-0.012134,-0.747506,0.097316,1.233476,0,1.0,19999,10.0,10.5
442629,-0.027084,-0.943735,0.121985,1.554992,1,1.0,19999,10.0,10.5
442630,-0.045959,-0.750268,0.153085,1.302723,0,1.0,19999,10.0,10.5


In [130]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
import datetime

n = datetime.datetime.now
t = n()

RandomForest = RandomForestRegressor()
RandomForest.fit(memory_df[["obs0", "obs1", "obs2", "obs3", "action"]], memory_df.comb_reward)
print(f"Training time: {n() - t}s")

Training time: 0:06:34.983846s


In [35]:
AdaBoost = AdaBoostRegressor()
AdaBoost.fit(memory_df[["obs0", "obs1", "obs2", "obs3", "action"]], memory_df.comb_reward)

AdaBoostRegressor()

In [36]:
ExtraTrees = ExtraTreesRegressor()
ExtraTrees.fit(memory_df[["obs0", "obs1", "obs2", "obs3", "action"]], memory_df.comb_reward)

ExtraTreesRegressor()

In [124]:
t = n()
num_episodes = 100
a_life_memory = []
a_env = gym.make('CartPole-v0')
for i in range(num_episodes):
    a_old_observation = a_env.reset()
    a_done = False
    a_tot_reward = 0
    a_em_memory = []
    
    while not a_done:
        pred_in = [list(a_old_observation) + [i] for i in range(2)]
        a_new_action = np.argmax(AdaBoost.predict(pred_in))
        a_observation, a_reward, a_done, a_info = a_env.step(a_new_action)
        a_tot_reward += a_reward
        
        a_em_memory.append({
            "obs0": a_old_observation[0],
            "obs1": a_old_observation[1],
            "obs2": a_old_observation[2],
            "obs3": a_old_observation[3],
            "action": a_new_action,
            "reward": a_reward,
            "episode": i,
        })
        
        a_old_observation = a_observation

    for a_em in a_em_memory:
        a_em["tot_reward"] = a_tot_reward
        
    a_life_memory.extend(a_em_memory)
a_memory_df = pandas.DataFrame(a_life_memory)
print(f"Training time: {n() - t}s")

Training time: 0:00:01.277858s


In [126]:
a_memory_df.shape

(929, 8)

In [131]:
t = n()
num_episodes = 100
r_life_memory = []
r_env = gym.make('CartPole-v0')
for i in range(num_episodes):
    r_old_observation = r_env.reset()
    r_done = False
    r_tot_reward = 0
    r_em_memory = []
    while not r_done:
        pred_in = [list(r_old_observation) + [i] for i in range(2)]
        r_new_action = np.argmax(RandomForest.predict(pred_in))
        r_observation, r_reward, r_done, r_info = r_env.step(r_new_action)
        r_tot_reward += r_reward
        
        r_em_memory.append({
            "obs0": r_old_observation[0],
            "obs1": r_old_observation[1],
            "obs2": r_old_observation[2],
            "obs3": r_old_observation[3],
            "action": r_new_action,
            "reward": r_reward,
            "episode": i,
        })
        
        r_old_observation = r_observation
        
    for r_em in r_em_memory:
        r_em["tot_reward"] = r_tot_reward
        
    r_life_memory.extend(r_em_memory)
r_memory_df = pandas.DataFrame(r_life_memory)
print(f"Training time: {n() - t}s")

Training time: 0:01:39.444469s


In [132]:
pandas.DataFrame(r_life_memory).shape

(16125, 8)

In [133]:
r_memory_df.groupby("episode").reward.sum().mean()

161.25

In [115]:
t = n()
e_life_memory = []
e_env = gym.make('CartPole-v0')
for i in range(num_episodes):
    e_old_observation = e_env.reset()
    e_done = False
    e_tot_reward = 0
    e_em_memory = []
    while not e_done:
        pred_in = [list(e_old_observation) + [i] for i in range(2)]
        e_new_action = np.argmax(ExtraTrees.predict(pred_in))
        e_observation, e_reward, e_done, e_info = e_env.step(e_new_action)
        e_tot_reward += e_reward
        e_em_memory.append({
            "obs0": e_old_observation[0],
            "obs1": e_old_observation[1],
            "obs2": e_old_observation[2],
            "obs3": e_old_observation[3],
            "action": e_new_action,
            "reward": e_reward,
            "episode": i,
        })
        
        e_old_observation = e_observation

    for e_em in e_em_memory:
        e_em["tot_reward"] = e_tot_reward
    e_life_memory.extend(e_em_memory)
e_memory_df = pandas.DataFrame(e_life_memory)
print(f"Training time: {n() - t}s")

Training time: 0:01:20.116766s


In [108]:
e_env.reset()
e_env.step(1)

(array([ 0.03719052,  0.20216305,  0.01937592, -0.24454289]), 1.0, False, {})

In [109]:
np.argmax(ExtraTrees.predict([[ 0.03719052,  0.20216305,  0.01937592, -0.24454289, 1], [ 0.03719052,  0.20216305,  0.01937592, -0.24454289, 0]]))

0

In [134]:
r_memory_df.groupby("episode").reward.sum().mean()

161.25

In [135]:
(r_memory_df.groupby("episode").reward.sum() >= 200).value_counts()

False    72
True     28
Name: reward, dtype: int64

In [118]:
a_memory_df.groupby("episode").reward.sum().mean()

9.35

In [120]:
(a_memory_df.groupby("episode").reward.sum() >= 200).value_counts()

False    100
Name: reward, dtype: int64

In [117]:
e_memory_df.groupby("episode").reward.sum().mean()

125.45

In [121]:
(e_memory_df.groupby("episode").reward.sum() >= 200).value_counts()

False    88
True     12
Name: reward, dtype: int64