In [1]:
import pybullet_envs
import gym
import numpy as np
from sac_torch import Agent, Agent_sm
import logging
import sys
import os
import matplotlib.pyplot as plt
# Send log records both to stdout (so they show up under the cell) and to
# tmp/log.txt.
log_format = '%(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
    format=log_format, datefmt='%m/%d %I:%M:%S %p')

# FileHandler opens its file immediately and raises FileNotFoundError when the
# parent directory is missing, so make sure the directory exists first.
log_path = os.path.join('tmp', 'log.txt')
os.makedirs(os.path.dirname(log_path), exist_ok=True)

root_logger = logging.getLogger()
# Re-running this cell must not attach a second handler for the same file,
# otherwise every message would be written to the log twice.
already_attached = any(
    isinstance(handler, logging.FileHandler)
    and handler.baseFilename == os.path.abspath(log_path)
    for handler in root_logger.handlers
)
if not already_attached:
    fh = logging.FileHandler(log_path)
    fh.setFormatter(logging.Formatter(log_format))
    root_logger.addHandler(fh)

def loss_to_float(value):
    """Return ``value`` as a plain Python float.

    Values that expose ``detach()`` (e.g. torch tensors) are detached first,
    so the stored history holds a number and not the tensor itself.
    Only single-element values are supported, because ``float()`` rejects
    anything else.
    """
    if hasattr(value, 'detach'):
        value = value.detach()
    return float(value)


# Every action chosen during the training loop below, one entry per env step.
action_record = []
if __name__ == '__main__':
    env = gym.make('InvertedPendulumBulletEnv-v0')
    agent = Agent_sm(input_dims=env.observation_space.shape[0], env=env,
                n_actions=env.action_space.shape[0])
    n_games = 200
    best_score = env.reward_range[0]
    score_history = []
    load_checkpoints = False

    if load_checkpoints:
        agent.load_models()
        #env.render(mode='human')

    #save losses: final_loss, value_loss, actor_loss, critic_loss
    # One entry per agent.learn() call that returned something other than None.
    final_loss = []
    value_loss = []
    actor_loss = []
    critic_loss = []
    for i in range(n_games):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            action_record.append(action)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.remember(observation, action, reward, observation_, done)
            #if not load_checkpoints:
            l = agent.learn()
            if l is not None:
                # Store plain floats, never the objects learn() returned.
                # NOTE(review): the recorded run of this cell ended in a CUDA
                # out-of-memory error; if learn() returns tensors, keeping
                # every one of them in these lists is a likely contributor --
                # confirm against sac_torch.
                final_loss.append(loss_to_float(l[0]))
                value_loss.append(loss_to_float(l[1]))
                actor_loss.append(loss_to_float(l[2]))
                critic_loss.append(loss_to_float(l[3]))
            observation = observation_
        score_history.append(score)
        # Running mean over (at most) the last 100 episodes.
        avg_score = np.mean(score_history[-100:])

        # Checkpointing on a new best average is currently disabled:
#         if avg_score > best_score:
#             best_score = avg_score
#             #if not load_checkpoints:
#             agent.save_models()
        logging.info('episode %d score %.1f avg score %.1f', i, score, avg_score)



02/28 10:50:05 PM episode 0 score 42.0 avg score 42.0
02/28 10:50:05 PM episode 1 score 16.0 avg score 29.0
02/28 10:50:05 PM episode 2 score 17.0 avg score 25.0
02/28 10:50:05 PM episode 3 score 20.0 avg score 23.8
02/28 10:50:05 PM episode 4 score 36.0 avg score 26.2
02/28 10:50:05 PM episode 5 score 17.0 avg score 24.7
02/28 10:50:05 PM episode 6 score 20.0 avg score 24.0
02/28 10:50:05 PM episode 7 score 28.0 avg score 24.5
02/28 10:50:05 PM episode 8 score 27.0 avg score 24.8
02/28 10:50:05 PM episode 9 score 23.0 avg score 24.6
02/28 10:50:06 PM episode 10 score 22.0 avg score 24.4
02/28 10:50:06 PM episode 11 score 15.0 avg score 23.6
02/28 10:50:07 PM episode 12 score 27.0 avg score 23.8
02/28 10:50:08 PM episode 13 score 17.0 avg score 23.4
02/28 10:50:09 PM episode 14 score 18.0 avg score 23.0
02/28 10:50:09 PM episode 15 score 17.0 avg score 22.6
02/28 10:50:10 PM episode 16 score 17.0 avg score 22.3
02/28 10:50:10 PM episode 17 score 9.0 avg score 21.6
02/28 10:50:12 PM epi

RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 10.92 GiB total capacity; 10.36 GiB already allocated; 1.50 MiB free; 10.36 GiB reserved in total by PyTorch)

In [None]:
# One point per agent.learn() call that returned losses (index 1 of its result).
# float() keeps the series plottable even if the entries are single-element
# tensors rather than plain numbers -- TODO confirm what learn() returns.
fig, ax = plt.subplots()
ax.plot([float(v) for v in value_loss])
ax.set(title='Value loss', xlabel='learning update', ylabel='loss');

In [None]:
# One point per agent.learn() call that returned losses (index 2 of its result).
# float() keeps the series plottable even if the entries are single-element
# tensors rather than plain numbers -- TODO confirm what learn() returns.
fig, ax = plt.subplots()
ax.plot([float(v) for v in actor_loss])
ax.set(title='Actor loss', xlabel='learning update', ylabel='loss');

In [None]:
# One point per agent.learn() call that returned losses (index 3 of its result).
# float() keeps the series plottable even if the entries are single-element
# tensors rather than plain numbers -- TODO confirm what learn() returns.
fig, ax = plt.subplots()
ax.plot([float(v) for v in critic_loss])
ax.set(title='Critic loss', xlabel='learning update', ylabel='loss');

In [None]:
# action_record holds one entry per environment step of the first training loop.
fig, ax = plt.subplots()
ax.plot(action_record)
ax.set(title='Actions chosen during training', xlabel='environment step', ylabel='action');

In [None]:
# Second training phase: same loop as the first one, but updates go through
# agent.learn_sm(). This cell relies on `env`, `agent` and `score_history`
# created by the first training cell, so that cell has to be run first.
#save losses: final_loss, value_loss, actor_loss, critic_loss
n_games_sm = 50
final_loss_sm = []
value_loss_sm = []
actor_loss_sm = []
critic_loss_sm = []
action_record_sm = []
for i in range(n_games_sm):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(observation)
        action_record_sm.append(action)
        observation_, reward, done, info = env.step(action)
        score += reward
        agent.remember(observation, action, reward, observation_, done)
        #if not load_checkpoints:
        l = agent.learn_sm()
        if l is not None:
            # Store plain floats, never the objects learn_sm() returned: if
            # they are tensors, keeping every one of them in these lists
            # holds on to memory for the whole run. float() only accepts
            # single-element values -- TODO confirm the losses are scalars.
            final_loss_sm.append(float(l[0]))
            value_loss_sm.append(float(l[1]))
            actor_loss_sm.append(float(l[2]))
            critic_loss_sm.append(float(l[3]))
        observation = observation_
    # score_history is shared with the first training loop, so the running
    # average below also includes episodes from that phase.
    score_history.append(score)
    avg_score = np.mean(score_history[-100:])

    # Checkpointing on a new best average is currently disabled:
#         if avg_score > best_score:
#             best_score = avg_score
#             #if not load_checkpoints:
#             agent.save_models()
    logging.info('episode %d score %.1f avg score %.1f', i, score, avg_score)

In [None]:
# One point per agent.learn_sm() call that returned losses (index 2 of its result).
# float() keeps the series plottable even if the entries are single-element
# tensors rather than plain numbers -- TODO confirm what learn_sm() returns.
fig, ax = plt.subplots()
ax.plot([float(v) for v in actor_loss_sm])
ax.set(title='Actor loss (learn_sm phase)', xlabel='learning update', ylabel='loss');

In [None]:
# action_record_sm holds one entry per environment step of the learn_sm phase.
fig, ax = plt.subplots()
ax.plot(action_record_sm)
ax.set(title='Actions chosen during learn_sm phase', xlabel='environment step', ylabel='action');