In [2]:
!pip install gym[box2d] tensorboard tensorboardX ptan ignite



In [40]:
!pip install --upgrade jupyterlab-git

Collecting jupyterlab-git
  Downloading jupyterlab_git-0.20.0-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 2.8 MB/s eta 0:00:01
Installing collected packages: jupyterlab-git
  Attempting uninstall: jupyterlab-git
    Found existing installation: jupyterlab-git 0.10.1
    Uninstalling jupyterlab-git-0.10.1:
      Successfully uninstalled jupyterlab-git-0.10.1
Successfully installed jupyterlab-git-0.20.0


In [118]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gym
import numpy as np

import ptan
from tensorboardX import SummaryWriter

In [208]:
class Config():
    """Hyper-parameters and runtime settings read by the other notebook cells."""

    #NETWORK

    # Widths of the two hidden layers passed to the Actor constructor.
    ACTOR_FC1_UNITS = 128
    ACTOR_FC2_UNITS = 64
    # Learning rate handed to the optimizer.
    LR_ACTOR = 1e-3
    
    # Discount factor used for the experience source and for calc_qvals.
    GAMMA = 0.99

    # Prefer the GPU but fall back to the CPU, so the notebook still runs on
    # machines without CUDA instead of failing on the first `.to(device)`.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
def calc_qvals(rewards,gamma):
    """Return mean-centred discounted returns for one episode.

    Args:
        rewards: per-step rewards of a single episode, in chronological order.
        gamma: discount factor applied once per step into the future.

    Returns:
        A list with one entry per step: the discounted return from that step
        onward, minus the mean of all such returns in the episode.
    """
    discounted = []
    running = 0.0
    # Walk the episode backwards so each return reuses the one after it.
    for reward in reversed(rewards):
        running = running * gamma + reward
        discounted.append(running)
    # Restore chronological order.
    discounted.reverse()
    baseline = np.mean(discounted)
    return [value - baseline for value in discounted]

In [209]:
class Actor(nn.Module):
    """Policy network: two ReLU hidden layers followed by a linear output layer."""

    def __init__(self, state_size, action_size, fc1_units=400, fc2_units=300):
        """Create the three fully connected layers.

        Args:
            state_size: number of input features per state.
            action_size: number of output values (one per action).
            fc1_units: width of the first hidden layer.
            fc2_units: width of the second hidden layer.
        """
        super().__init__()
        # Layers are created in fc1 -> fc2 -> fc3 order and keep these attribute
        # names, so seeded initialisation and state_dict keys stay the same.
        self.fc1 = nn.Linear(in_features=state_size, out_features=fc1_units)
        self.fc2 = nn.Linear(in_features=fc1_units, out_features=fc2_units)
        self.fc3 = nn.Linear(in_features=fc2_units, out_features=action_size)

    def forward(self, state):
        """Map a batch of states to one raw score (logit) per action.

        No activation is applied to the last layer; normalisation into
        probabilities is left to the caller.
        """
        return self.fc3(F.relu(self.fc2(F.relu(self.fc1(state)))))

In [217]:
# --- Experiment settings -------------------------------------------------
env_name = 'LunarLander-v2'   # id passed to gym.make
MAX_EPISODES = 30000          # training gives up once this many episodes are done
EPISODES_TO_TRAIN = 4         # finished episodes collected before each update
OUTPUT_EVERY = 100            # print a progress line every N finished episodes
NUM_ENV = 4                   # not referenced by the cells shown here

# --- Reproducibility -----------------------------------------------------
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

In [218]:
env = gym.make(env_name)
# torch and numpy are seeded earlier in the notebook, but the environment has
# its own random generator; seed it too so episodes are reproducible.
# Guarded because newer gym releases dropped Env.seed().
if hasattr(env, "seed"):
    env.seed(seed)

In [219]:
# Settings object; later cells read config.device, config.GAMMA, config.LR_ACTOR
# and the two ACTOR_FC*_UNITS layer widths from it.
config = Config()

In [220]:
# tensorboardX logger; the training loop writes the "reward", "reward_100",
# "episodes" and "loss" scalars through it and closes it when the loop ends.
writer = SummaryWriter(comment="-lunarlander-reinforce")

In [221]:
# Network dimensions taken from the environment: first dimension of the
# observation shape, and the action count reported by the action space (`.n`).
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

In [222]:
# Action selector handed to the agent.
selector = ptan.actions.ProbabilityActionSelector()

# Policy network, moved to the configured device.
model = Actor(
    state_size,
    action_size,
    config.ACTOR_FC1_UNITS,
    config.ACTOR_FC2_UNITS,
).to(config.device)

# Agent wrapping the network; apply_softmax=True because the network emits logits.
agent = ptan.agent.PolicyAgent(
    model,
    action_selector=selector,
    device=config.device,
    preprocessor=ptan.agent.float32_preprocessor,
    apply_softmax=True,
)

# Experience stream consumed by the training loop.
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env=env,
    agent=agent,
    steps_count=1,
    gamma=config.GAMMA,
)

optimizer = optim.Adam(model.parameters(), lr=config.LR_ACTOR)

In [223]:
# REINFORCE training loop: collect EPISODES_TO_TRAIN finished episodes, then do
# one policy-gradient step on all of their transitions.
total_rewards = []
step_idx = 0
done_episodes = 0

# Transitions of the episodes finished since the last optimizer step.
batch_episodes = 0
batch_states, batch_actions, batch_qvals = [], [], []
# Transitions of the episode currently in progress.
cur_states, cur_actions, cur_rewards = [], [], []

# try/finally so the TensorBoard writer is flushed and closed even when the
# (long-running) loop is interrupted or raises.
try:
    for step_idx, exp in enumerate(exp_source):
        cur_states.append(exp.state)
        cur_actions.append(int(exp.action))
        cur_rewards.append(exp.reward)

        # A missing last_state is treated here as the end of an episode: move
        # the episode into the batch, with rewards turned into centred returns.
        if exp.last_state is None:
            batch_states.extend(cur_states)
            batch_actions.extend(cur_actions)
            batch_qvals.extend(calc_qvals(cur_rewards,config.GAMMA))
            cur_states.clear()
            cur_actions.clear()
            cur_rewards.clear()
            batch_episodes += 1

        # Episode-level bookkeeping, logging and stop conditions.
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            done_episodes += 1
            reward = new_rewards[0]
            total_rewards.append(reward)
            mean_rewards = float(np.mean(total_rewards[-100:]))
            writer.add_scalar("reward", reward, step_idx)
            writer.add_scalar("reward_100", mean_rewards, step_idx)
            writer.add_scalar("episodes", done_episodes, step_idx)

            if done_episodes % OUTPUT_EVERY == 0:
                print("%d: mean_100: %6.2f, episodes: %d" % (step_idx, mean_rewards, done_episodes))
            if mean_rewards > 200:
                print("Solved in %d steps and %d episodes!" % (step_idx, done_episodes))
                break

            # `>=` so that at most MAX_EPISODES episodes are run (was off by one).
            if done_episodes >= MAX_EPISODES:
                print("Problem failed to solve!")
                break

        if batch_episodes < EPISODES_TO_TRAIN:
            continue

        # Stack the states with numpy first; building a tensor straight from a
        # list of arrays is much slower.
        states_v = torch.as_tensor(np.array(batch_states, dtype=np.float32)).to(config.device)
        batch_actions_t = torch.LongTensor(batch_actions).to(config.device)
        batch_qvals_v = torch.FloatTensor(batch_qvals).to(config.device)

        optimizer.zero_grad()
        logits_v = model(states_v)
        log_prob_v = F.log_softmax(logits_v, dim=1)
        # Log-probability of each action actually taken, weighted by its return.
        log_prob_actions_v = batch_qvals_v * log_prob_v[range(len(batch_states)), batch_actions_t]
        loss_v = -log_prob_actions_v.mean()

        loss_v.backward()
        optimizer.step()
        # Log a plain float rather than a tensor still attached to the graph.
        writer.add_scalar("loss", loss_v.item(), step_idx)

        # Start a fresh batch.
        batch_episodes = 0
        batch_states.clear()
        batch_actions.clear()
        batch_qvals.clear()
finally:
    writer.close()

8818: mean_100: -162.29, episodes: 100
18076: mean_100: -148.50, episodes: 200
27728: mean_100: -122.43, episodes: 300
38692: mean_100: -129.83, episodes: 400
52788: mean_100: -138.87, episodes: 500
69167: mean_100: -96.10, episodes: 600
89318: mean_100: -58.78, episodes: 700
112188: mean_100: -67.14, episodes: 800
146637: mean_100: -57.57, episodes: 900
181948: mean_100: -21.03, episodes: 1000
234342: mean_100: -33.20, episodes: 1100
291097: mean_100: -16.79, episodes: 1200
351605: mean_100:  18.86, episodes: 1300
398397: mean_100:  15.83, episodes: 1400
461224: mean_100:   7.50, episodes: 1500
532337: mean_100:  54.91, episodes: 1600
598726: mean_100:  59.03, episodes: 1700
683190: mean_100:  81.82, episodes: 1800
762568: mean_100:  91.09, episodes: 1900
834625: mean_100:  82.61, episodes: 2000
928294: mean_100:  89.89, episodes: 2100
1003570: mean_100:  88.41, episodes: 2200
1083016: mean_100: 100.28, episodes: 2300
1173458: mean_100: 105.70, episodes: 2400
1257873: mean_100:  92.61

In [224]:
# Persist only the network weights (state_dict) to ./solved.params, relative to
# the current working directory.
# NOTE(review): this cell runs unconditionally, so the file name does not
# guarantee the saved policy actually reached the "solved" threshold.
torch.save(model.state_dict(), './solved.params')