In [1]:
from IPython.display import display, HTML
# Inject a CSS override so the notebook's main container spans the full browser width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [2]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torchvision import transforms
import matplotlib.pyplot as plt
from collections import deque
import pickle

import numpy as np

In [3]:
# Use the first CUDA GPU when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [4]:
env_id = "CarRacing-v2"

# Build two identically configured environments with discrete actions and
# no domain randomisation: the first is used for training rollouts, the
# second is reserved for evaluation.
env, eval_env = (
    gym.make(env_id, continuous=False, domain_randomize=False)
    for _ in range(2)
)

# Number of discrete actions exposed by the environment.
n_actions = env.action_space.n

# Number of most recent observations kept in the frame stack given to the policy.
n_frames = 4

In [5]:
from policy import Policy

In [6]:
# Number of consecutive negative-reward steps tolerated before an episode is
# cut short. The counter is reset whenever a step returns a reward >= 0.
MAX_PATIENCE = 100

In [7]:
def evaluate_agent(env, n_eval_episodes, policy, noop_steps=60,
                   max_patience=None, frame_stack=None):
    """Run `policy` without exploration and summarise the episode returns.

    Each episode resets `env`, performs `noop_steps` steps with action 0,
    fills a frame stack with the resulting observation and then lets the
    policy act (with ``exploration=False``) until the episode ends.

    An episode ends when the environment reports termination or truncation,
    or when `max_patience` consecutive steps have returned a negative reward.

    Args:
        env: Environment whose ``step`` returns the 5-tuple
            ``(obs, reward, terminated, truncated, info)``.
        n_eval_episodes: Number of episodes to run.
        policy: Object exposing ``act(states, exploration=False)`` that
            returns ``(action, log_prob)``.
        noop_steps: Number of action-0 steps performed after each reset.
        max_patience: Allowed run of consecutive negative-reward steps.
            Defaults to the module-level ``MAX_PATIENCE``.
        frame_stack: Number of observations kept in the stack passed to
            the policy. Defaults to the module-level ``n_frames``.

    Returns:
        ``(mean_reward, std_reward)`` over the evaluated episodes.
    """
    if max_patience is None:
        max_patience = MAX_PATIENCE
    if frame_stack is None:
        frame_stack = n_frames

    episode_rewards = []

    for _episode in range(n_eval_episodes):
        reset_out = env.reset()
        # Newer gym versions return (obs, info) from reset(); keep only the
        # observation so the frame stack is valid even when noop_steps == 0.
        state = reset_out[0] if isinstance(reset_out, tuple) else reset_out

        # perform noop for `noop_steps` steps (noisy start)
        for _noop in range(noop_steps):
            state, _, _, _, _ = env.step(0)

        # stats
        total_rewards_ep = 0
        negative_reward_patience = max_patience

        # state: the stack starts out filled with copies of the first frame
        states = deque([state] * frame_stack, maxlen=frame_stack)

        done = False
        while not done:
            # No gradients are needed during evaluation, so avoid building
            # an autograd graph for every action.
            with torch.no_grad():
                action, _ = policy.act(states, exploration=False)

            state, reward, terminated, truncated, _ = env.step(action)
            states.append(state)

            # handle patience
            out_of_patience = False
            if reward >= 0:
                negative_reward_patience = max_patience
            else:
                negative_reward_patience -= 1
                out_of_patience = negative_reward_patience <= 0

            # The final step of a terminated or patience-limited episode is
            # scored as -100. Reaching the time limit (truncation) ends the
            # episode without this penalty.
            # NOTE(review): `terminated` may also signal a successfully
            # finished episode, which is penalised here too - confirm that
            # this is intended.
            if terminated or out_of_patience:
                reward = -100

            # stats
            total_rewards_ep += reward

            done = terminated or truncated or out_of_patience

        # stats
        episode_rewards.append(total_rewards_ep)

    # stats
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward

In [8]:
def reinforce(policy, optimizer, n_training_episodes=1000, gamma=0.99, print_every=5,
              train_env=None, noop_steps=60, max_patience=None, frame_stack=None):
    """Train `policy` with the REINFORCE policy-gradient update.

    One episode is collected per iteration; the loss
    ``-sum_t gamma**t * G_t * log_prob_t`` is then minimised with a single
    optimizer step, where ``G_t`` is the discounted return from step ``t``.

    An episode ends when the environment reports termination or truncation,
    or when `max_patience` consecutive steps have returned a negative reward.

    Every `print_every` episodes the running average score, the last score
    and an evaluation score (5 episodes on the module-level ``eval_env``)
    are printed and the policy is saved to ``model.pt``.

    Args:
        policy: Module exposing ``act(states)`` that returns
            ``(action, log_prob)``; ``log_prob`` must be a one-element tensor.
        optimizer: Optimizer over the policy's parameters.
        n_training_episodes: Number of training episodes.
        gamma: Discount factor.
        print_every: Reporting / checkpoint interval in episodes.
        train_env: Environment to collect rollouts from. Defaults to the
            module-level ``env``.
        noop_steps: Number of action-0 steps performed after each reset.
        max_patience: Allowed run of consecutive negative-reward steps.
            Defaults to the module-level ``MAX_PATIENCE``.
        frame_stack: Number of observations kept in the stack passed to
            the policy. Defaults to the module-level ``n_frames``.
    """
    if train_env is None:
        train_env = env
    if max_patience is None:
        max_patience = MAX_PATIENCE
    if frame_stack is None:
        frame_stack = n_frames

    # stats
    scores_deque = deque(maxlen=100)

    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []  # stores log probs during episode
        rewards = []  # stores rewards during episode

        # init episode
        reset_out = train_env.reset()
        # Newer gym versions return (obs, info) from reset(); keep only the
        # observation so the frame stack is valid even when noop_steps == 0.
        state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
        for _noop in range(noop_steps):
            state, _, _, _, _ = train_env.step(0)

        negative_reward_patience = max_patience
        states = deque([state] * frame_stack, maxlen=frame_stack)

        done = False
        while not done:
            action, log_prob = policy.act(states)
            saved_log_probs.append(log_prob)

            state, reward, terminated, truncated, _ = train_env.step(action)
            states.append(state)

            out_of_patience = False
            if reward >= 0:
                negative_reward_patience = max_patience
            else:
                negative_reward_patience -= 1
                out_of_patience = negative_reward_patience <= 0

            # The final step of a terminated or patience-limited episode is
            # scored as -100. Reaching the time limit (truncation) ends the
            # episode without this penalty.
            # NOTE(review): `terminated` may also signal a successfully
            # finished episode, which is penalised here too - confirm that
            # this is intended.
            if terminated or out_of_patience:
                reward = -100

            rewards.append(reward)
            done = terminated or truncated or out_of_patience

        scores_deque.append(sum(rewards))

        rewards = np.array(rewards)
        n_steps = len(rewards)
        discounts = np.power(gamma, np.arange(n_steps))

        # Discounted return from every step, accumulated backwards in O(T)
        # instead of re-summing the remaining rewards at each step.
        returns = np.zeros(n_steps)
        running_return = 0.0
        for t in range(n_steps - 1, -1, -1):
            running_return = rewards[t] + gamma * running_return
            returns[t] = running_return

        # loss = -sum_t gamma**t * G_t * log_prob_t, evaluated in one shot.
        log_probs = torch.cat([lp.reshape(1) for lp in saved_log_probs])
        weights = torch.as_tensor(discounts * returns,
                                  dtype=log_probs.dtype,
                                  device=log_probs.device)
        policy_loss = -(weights * log_probs).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % print_every == 0:
            eval_score = evaluate_agent(eval_env, 5, policy)
            print(f'''Episode {i_episode}
                    \tAverage Score: {np.mean(scores_deque)}
                    \tLast Score: {rewards.sum()}
                    \tEval Score: {eval_score}''')
            # Saves the whole module object; loading it back requires the
            # Policy class to be importable.
            torch.save(policy, 'model.pt')


In [9]:
# Build the policy network and move it to the selected device. A single
# `.to(device)` is sufficient; repeating the call has no further effect.
policy = Policy(n_frames, n_actions, 32).to(device)

In [10]:
# Adam optimiser over every parameter of the policy network.
optimizer = optim.Adam(params=policy.parameters(), lr=1e-3)

In [11]:
# Train with the function's defaults (1000 episodes, gamma=0.99). Every 5
# episodes it prints training/evaluation scores and saves the policy to
# 'model.pt'.
reinforce(policy, optimizer)

Episode 5
                    	Average Score: -120.30909357831001
                    	Last Score: -123.01263940520447
                    	Eval Score: (-109.89999999999998, 0.0)
Episode 10
                    	Average Score: -130.6569172758115
                    	Last Score: -157.00000000000125
                    	Eval Score: (-54.133242509156524, 4.381957024199433)
Episode 15
                    	Average Score: -133.82803310232563
                    	Last Score: -117.61627906976737
                    	Eval Score: (-42.71751515599963, 31.300059408389455)
Episode 20
                    	Average Score: -127.90492521185395
                    	Last Score: -88.27647058823555
                    	Eval Score: (-6.868594732419747, 40.91745480573195)
Episode 25
                    	Average Score: -115.79905612204625
                    	Last Score: -54.778494623656385
                    	Eval Score: (-49.23013430140234, 17.685689882852795)


KeyboardInterrupt: 

In [None]:
# Report the device the policy's parameters live on. Querying a parameter
# works for any torch module, whether or not the class defines its own
# `device` attribute.
next(policy.parameters()).device
