In [82]:
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

from torch.optim import Adam, lr_scheduler
from torch.utils.data import DataLoader, TensorDataset

In [83]:
# random seed
torch.manual_seed(42)
np.random.seed(42)


In [84]:
vocab_size = 10
d_model = 512 #128

num_heads = 8 #4
num_layers = 6 #5
dropout = 0.1 #0.3
batch_size = 32 #
len_seq = 6

# 5000 permutacoes de 8 digitos

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


cpu


In [85]:
global ALREADY_PRINT
ALREADY_PRINT = 0

class Embedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super(Embedding, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # print(self.vocab_size, self.d_model)
        return self.embedding(x)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        # implements the positional encoding function as the size of x
        self.dropout = nn.Dropout(p=dropout)
        max_len = 512
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].requires_grad_(False)
        return self.dropout(x)


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.dk = d_model // num_heads

        self.WO = nn.Linear(d_model, d_model)
        self.WQ = nn.Linear(d_model, d_model)
        self.WK = nn.Linear(d_model, d_model)
        self.WV = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def attention(self, Q, K, V, dk, mask=False):
        QKT = torch.matmul(Q, K.transpose(-2, -1))  # output dim: (batch_size, num_heads, n_tokens, n_tokens)
        scaled_dot_product = torch.div(QKT, math.sqrt(dk))

        if mask:
            mask = torch.triu(torch.ones(QKT.size(-2), QKT.size(-1)), diagonal=1).bool().to(device)
            scaled_dot_product = scaled_dot_product.masked_fill(mask, -float('inf'))
            # print("Masked")
            # print(scaled_dot_product)
            # ALREADY_PRINT = 1

        sm = torch.nn.Softmax(dim=-1)
        attention = sm(scaled_dot_product)
        attention = self.dropout(attention)
        return torch.matmul(attention, V)

    def forward(self, Q, K, V, mask=False):
        Q = self.WQ(Q)
        K = self.WK(K)
        V = self.WV(V)

        Q = Q.view(Q.size(0), -1, self.num_heads, self.dk).transpose(1, 2)  # Q: Why -1? A: It's a batch size
        K = K.view(K.size(0), -1, self.num_heads, self.dk).transpose(1, 2)
        V = V.view(V.size(0), -1, self.num_heads, self.dk).transpose(1, 2)

        attention = self.attention(Q, K, V, self.dk, mask)
        attention = attention.transpose(1, 2).contiguous().view(Q.size(0), -1, self.d_model)
        return self.WO(attention)

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.multi_head_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model)
        )

    def forward(self, x, mask):
        x = x + self.dropout1(self.multi_head_attention(x, x, x, mask))
        x = self.norm1(x)

        x = x + self.dropout2(self.ff(x))
        x = self.norm2(x)
        return x

class Decoder(nn.Module):
    def __init__(self, d_model, num_heads, num_layers, dropout=0.1):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, dropout) for _ in range(num_layers)])

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return x

class MyTransformerDecoderOnly(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers,  dropout=0.1):
        super(MyTransformerDecoderOnly, self).__init__()
        self.embedding = Embedding(vocab_size, d_model)
        self.transpose_embedding = nn.Linear(d_model, vocab_size)
        self.transpose_embedding.weight = self.embedding.embedding.weight
        self.pos_enc = PositionalEncoding(d_model)
        self.decoder = Decoder(d_model, num_heads, num_layers, dropout)
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask):
        x = self.embedding(x)
        x = self.pos_enc(x)
        x = self.decoder(x, mask)
        x = self.linear(x)
        # Transpos of emebedding
        # x = self.transpose_embedding(x)

        return x


# Shakespeare

In [86]:
import re

In [87]:
text = open('shakespeare.txt', 'r').read()


In [88]:
from torch.utils.data import Dataset
class WordDataset(Dataset):
    """
    arrange data and targets so that the first i elements of x
    will be asked to predict the i-th element of y. Notice that
    the eventual language model will actually make block_size
    individual predictions at the same time based on this data,
    so we are being clever and amortizing the cost of the forward
    pass of the network. So for example if block_size is 4, then
    we could e.g. sample a chunk of text "w1 w2 w3 w4 w5", the integers in
    x will correspond to "w1 w2 w3 w4" and in y will be "w2 w3 w4 w5". This will
    then actually "multitask" 4 separate examples at the same time
    in the language model:
    - given just "w1", please predict "w2" as next
    - given "w1 w2" please predict "w3" next
    - given "w1 w2 w3" predict "w4" next
    - given "w1 w2 w3 w4" predict "w5" next
    """
    def __init__(self, data, block_size):
        words = re.split(r"\b", data)
        vocab = sorted(list(set(words)))
        data_size, vocab_size = len(words), len(vocab)
        print('data has %d words, %d unique.' % (data_size, vocab_size))

        self.stoi = {word: i for i, word in enumerate(vocab)}
        self.itos = {i: word for i, word in enumerate(vocab)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = words

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every word to an integer
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [89]:
block_size = 128
train_dataset = WordDataset(text, block_size)

data has 1980893 words, 34230 unique.


In [90]:
batch_size = 64
train_loader = DataLoader(
    train_dataset, shuffle=True, pin_memory=True, batch_size=batch_size
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)


device: cpu


In [91]:
num_layers = 2
num_heads = 2
d_model = 128
vocab_size = train_dataset.vocab_size

model = MyTransformerDecoderOnly(vocab_size,
                                 d_model,
                                 num_heads,
                                 num_layers,
                                 dropout).to(device).train()


loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=6e-4)
max_epochs = 1

scheduler = lr_scheduler.PolynomialLR(optimizer, power=1, total_iters=max_epochs*len(train_dataset))


# for epoch in range(max_epochs):
#     pbar = tqdm(enumerate(train_loader), total=len(train_loader))
#     for it, (x, y) in pbar:
#         x = x.to(device)
#         y = y.to(device)
# #         print(x.shape)
#         optimizer.zero_grad()

#         logits = model(x, True)
#         loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))
#         loss.backward()

#         optimizer.step()
#         # scheduler.step()

#         pbar.set_description(f"epoch {epoch} iter {it}: train loss {loss.item():.5f}")

In [92]:
# import gc
# # Save the model
# torch.save({
#     'model_state_dict': model.state_dict(),
#     'optimizer_state_dict': optimizer.state_dict(),
# }, 'model_checkpoint.pth')

# # save the model
# torch.save(model, 'model.pth')



In [93]:
# Delete the model
# del model
# gc.collect()
# torch.cuda.empty_cache()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model to CPU for inference
model = MyTransformerDecoderOnly(vocab_size, d_model, num_heads, num_layers, dropout).to(device).eval()
checkpoint = torch.load('model_checkpoint_jose.pth', map_location='cpu')
model = torch.load('model_jose.pth')


# Optional: Load optimizer state if needed
optimizer = Adam(model.parameters(), lr=6e-4)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])


In [94]:
model.eval()
block_size = 128

In [95]:
def top_k_logits(logits, k):
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
    """
    take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in
    the sequence, feeding the predictions back into the model each time
    """
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed
        logits = model(x_cond, mask=True)
        # pluck the logits at the final step and scale by temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop probabilities to only the top k options
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        # apply softmax to convert to probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution or take the most likely
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        # append to the sequence and continue
        x = torch.cat((x, ix), dim=1)

    return x



In [96]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
context = " O God, O God! "
x = torch.tensor([train_dataset.stoi[s] for s in re.split(r"\b", context)], dtype=torch.long)[None,...].to(device)
y = sample(model, x, 5500, temperature=1.0, sample=True, top_k=10)[0]
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print(completion)


 O God, O God! 
      _Bosko
      Varrius
      solemnities!               draws
      Varrius
      entomb
SON
      inside
      Varrius
      entomb
      golden
      shade
      Varrius
      insufficience!
               _Bosko
      Varrius
      insufficienceunpossessingprophetess
      insufficience!
               _Bosko
inside
      _Aside
      maculation!
               Varrius
      insufficience!
               _Bosko
      Varrius
      undistinguish
      unchaste
      frown
      dragon
      maledictions
      Varrius
      insufficience!

           Stayest
      golden
      frown
      forfeiters
      specialties
      snake
      clocks
Strong
      golden
      Varrius
      fees
      insufficience!
               Varrius
      faithfull
LORDS
      Varrius
      solemnities
      Varrius
      insufficienceunpossessingprophetess
      Varrius
      fees
      underborne!
               _Bosko
      Varrius
      insufficience
maledictions
      snake
      

# Classification

In [97]:
output_path = 'classified_sentences.txt'
with open(output_path, 'r', encoding='utf-8') as file:
    classified = eval(file.read())

# Print the classified sentences
for sentence, sentiment in classified[:5]:
    print(f"Sentence: {sentence}")
    if sentiment == 1:
        print("Sentiment: Positive")
    elif sentiment == 2:
        print("Sentiment: Negative")
    else:
        print("Sentiment: Neutral")
    print()

Sentence:  O God, O God! O heavens; that you do not say he is
not.
Sentiment: Positive

Sentence: 
DESDEMONA.
Sentiment: Neutral

Sentence: But now, what is your Grace.
Sentiment: Positive

Sentence: 
EMILIA.
Sentiment: Neutral

Sentence: I have been so good to my mistress.
Sentiment: Positive



Fine-tune your model to obtain a reward model that predicts the sentiment of a sample. You can treat this as a sequence-modeling problem by having a model predict special tokens such as "happy" and "sad" based on the sentiment, treating neutral labels as soft 50% labels. Remember to mask all but the last token of the sample.

In [98]:
import torch
import torch.nn as nn
import math

class SentimentTransformerDecoderOnly(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers, dropout=0.1):
        super(SentimentTransformerDecoderOnly, self).__init__()
        self.embedding = Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.decoder = Decoder(d_model, num_heads, num_layers, dropout)
        self.linear = nn.Linear(d_model, vocab_size)
        self.sentiment_head = nn.Linear(d_model, 3)  # Predict positive, negative, neutral

    def forward(self, x, mask):
        x = self.embedding(x)
        x = self.pos_enc(x)
        x = self.decoder(x, mask)
        logits = self.linear(x)
        sentiment_logits = self.sentiment_head(x[:, -1, :])  # Use the last token's representation
        return logits, sentiment_logits




In [99]:
import re
import torch
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    def __init__(self, data, labels, block_size, stoi, itos):
        # words = re.split(r"\b", data)
        self.stoi = stoi
        self.itos = itos
        self.block_size = block_size
        self.vocab_size = len(stoi)
        self.labels = labels
        self.data = data

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        chunk = self.data[idx:idx + self.block_size + 1]
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y


In [100]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

sentences, sentiments = [], []
for sentence, sentiment in classified:
    sentences.append(sentence)
    sentiments.append(sentiment)

# split sentences in train and test
fraction_train = 0.8
num_train = int(len(sentences) * fraction_train)
sentences_train, sentences_test = sentences[:num_train], sentences[num_train:]
sentiments_train, sentiments_test = sentiments[:num_train], sentiments[num_train:]

print(len(sentences_train), len(sentences_test))

# add sentences and labels to dataset
train_dataset = SentimentDataset(sentences_train, sentiments_train, block_size, train_dataset.stoi, train_dataset.itos)
test_dataset = SentimentDataset(sentences_test, sentiments_test, block_size, train_dataset.stoi, train_dataset.itos)

# create data loaders
batch_size = 64
train_loader = DataLoader(
    train_dataset, shuffle=True, pin_memory=True, batch_size=batch_size
)

test_loader = DataLoader(
    test_dataset, shuffle=False, pin_memory=True, batch_size=batch_size
)

196 49


In [101]:

vocab_size_sent = len(train_dataset.stoi)
d_model_sent = 512
num_heads_sent = 8
num_layers_sent = 6
dropout_sent = 0.1

# Instantiate the model
model_sent = SentimentTransformerDecoderOnly(vocab_size_sent, d_model_sent, num_heads_sent, num_layers_sent, dropout_sent)

# Define optimizer and loss criterion
optimizer_sent = torch.optim.Adam(model_sent.parameters(), lr=1e-4)
criterion_sent = nn.CrossEntropyLoss()

# Training loop
model_sent.train()
for epoch in range(1):
    for input_ids_sent, label_sent in train_loader:
        optimizer_sent.zero_grad()

        # Forward pass
        logits_sent, sentiment_logits_sent = model_sent(input_ids_sent, mask=None)

        # Compute loss
        loss_sent = criterion_sent(sentiment_logits_sent, label_sent)

        # Backward pass and optimization
        loss_sent.backward()
        optimizer_sent.step()

        print(f"Epoch: {epoch}, Loss: {loss_sent.item()}")

KeyError: 'How now?'

# PPO

Source: https://github.com/ckkissane/deep_learning_curriculum/blob/master/solutions/6_Reinforcement_Learning.ipynb

In [None]:
! pip install procgen

In [None]:
import os, time
import gym
from gym import spaces
from procgen import ProcgenEnv
import random
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
import numpy as np
import pandas as pd
from collections import deque
from abc import ABC, abstractmethod
import matplotlib.pyplot as plt

In [None]:
def set_global_seeds(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

class Storage():

    def __init__(self, obs_shape, num_steps, num_envs, device):
        self.obs_shape = obs_shape
        self.num_steps = num_steps
        self.num_envs = num_envs
        self.device = device
        self.reset()

    def reset(self):
        self.obs_batch = torch.zeros(self.num_steps+1, self.num_envs, *self.obs_shape)
        self.act_batch = torch.zeros(self.num_steps, self.num_envs)
        self.rew_batch = torch.zeros(self.num_steps, self.num_envs)
        self.done_batch = torch.zeros(self.num_steps, self.num_envs)
        self.log_prob_act_batch = torch.zeros(self.num_steps, self.num_envs)
        self.value_batch = torch.zeros(self.num_steps+1, self.num_envs)
        self.return_batch = torch.zeros(self.num_steps, self.num_envs)
        self.adv_batch = torch.zeros(self.num_steps, self.num_envs)
        self.info_batch = deque(maxlen=self.num_steps)
        self.step = 0

    def store(self, obs, act, rew, done, info, log_prob_act, value):
        self.obs_batch[self.step] = torch.from_numpy(obs.copy())
        self.act_batch[self.step] = torch.from_numpy(act.copy())
        self.rew_batch[self.step] = torch.from_numpy(rew.copy())
        self.done_batch[self.step] = torch.from_numpy(done.copy())
        self.log_prob_act_batch[self.step] = torch.from_numpy(log_prob_act.copy())
        self.value_batch[self.step] = torch.from_numpy(value.copy())
        self.info_batch.append(info)

        self.step = (self.step + 1) % self.num_steps

    def store_last(self, last_obs, last_value):
        self.obs_batch[-1] = torch.from_numpy(last_obs.copy())
        self.value_batch[-1] = torch.from_numpy(last_value.copy())

    def compute_estimates(self, gamma=0.99, lmbda=0.95):
        rew_batch = self.rew_batch
        # use_gae
        A = 0
        for i in reversed(range(self.num_steps)):
            rew = rew_batch[i]
            done = self.done_batch[i]
            value = self.value_batch[i]
            next_value = self.value_batch[i+1]

            delta = (rew + gamma * next_value * (1 - done)) - value
            self.adv_batch[i] = A = gamma * lmbda * A * (1 - done) + delta

        self.return_batch = self.adv_batch + self.value_batch[:-1]
        # normalize_adv
        adv_mean = torch.mean(self.adv_batch)
        adv_std = torch.std(self.adv_batch)
        print(f"adv_mean: {adv_mean}")
        print(f"adv_std: {adv_std}")
        self.adv_batch = (self.adv_batch - adv_mean) / (adv_std + 1e-8)


    def fetch_train_generator(self, mini_batch_size=None):
        batch_size = self.num_steps * self.num_envs
        if mini_batch_size is None:
            mini_batch_size = batch_size
        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)),
                                mini_batch_size,
                                drop_last=True)
        for indices in sampler:
            obs_batch = torch.FloatTensor(self.obs_batch[:-1]).reshape(-1, *self.obs_shape)[indices].to(self.device)
            act_batch = torch.FloatTensor(self.act_batch).reshape(-1)[indices].to(self.device)
            done_batch = torch.FloatTensor(self.done_batch).reshape(-1)[indices].to(self.device)
            log_prob_act_batch = torch.FloatTensor(self.log_prob_act_batch).reshape(-1)[indices].to(self.device)
            value_batch = torch.FloatTensor(self.value_batch[:-1]).reshape(-1)[indices].to(self.device)
            return_batch = torch.FloatTensor(self.return_batch).reshape(-1)[indices].to(self.device)
            adv_batch = torch.FloatTensor(self.adv_batch).reshape(-1)[indices].to(self.device)
            yield obs_batch, act_batch, done_batch, log_prob_act_batch, value_batch, return_batch, adv_batch

    def fetch_log_data(self):
        if 'env_reward' in self.info_batch[0][0]:
            rew_batch = []
            for step in range(self.num_steps):
                infos = self.info_batch[step]
                rew_batch.append([info['env_reward'] for info in infos])
            rew_batch = np.array(rew_batch)
        else:
            rew_batch = self.rew_batch.numpy()
        if 'env_done' in self.info_batch[0][0]:
            done_batch = []
            for step in range(self.num_steps):
                infos = self.info_batch[step]
                done_batch.append([info['env_done'] for info in infos])
            done_batch = np.array(done_batch)
        else:
            done_batch = self.done_batch.numpy()
        return rew_batch, done_batch



## Logger

In [None]:
class Logger(object):

    def __init__(self, n_envs, logdir):
        self.start_time = time.time()
        self.n_envs = n_envs
        self.logdir = logdir

        self.episode_rewards = []
        for _ in range(n_envs):
            self.episode_rewards.append([])
        self.episode_len_buffer = deque(maxlen = 40)
        self.episode_reward_buffer = deque(maxlen = 40)

        self.log = pd.DataFrame(columns = ['timesteps', 'wall_time', 'num_episodes',
                               'max_episode_rewards', 'mean_episode_rewards','min_episode_rewards',
                               'max_episode_len', 'mean_episode_len', 'min_episode_len'])
        self.timesteps = 0
        self.num_episodes = 0

    def feed(self, rew_batch, done_batch):
        steps = rew_batch.shape[0]
        rew_batch = rew_batch.T
        done_batch = done_batch.T

        for i in range(self.n_envs):
            for j in range(steps):
                self.episode_rewards[i].append(rew_batch[i][j])
                if done_batch[i][j]:
                    self.episode_len_buffer.append(len(self.episode_rewards[i]))
                    self.episode_reward_buffer.append(np.sum(self.episode_rewards[i]))
                    self.episode_rewards[i] = []
                    self.num_episodes += 1
        self.timesteps += (self.n_envs * steps)

    def write_summary(self, summary):
        for key, value in summary.items():
            print(f"{key}: {value}")

    def dump(self):
        wall_time = time.time() - self.start_time
        if self.num_episodes > 0:
            episode_statistics = self._get_episode_statistics()
            episode_statistics_list = list(episode_statistics.values())
        else:
            episode_statistics_list = [None] * 6
        log = [self.timesteps] + [wall_time] + [self.num_episodes] + episode_statistics_list
        self.log.loc[len(self.log)] = log

        with open(self.logdir + '/log.csv', 'w') as f:
            self.log.to_csv(f, index = False)
        print(self.log.loc[len(self.log)-1])

    def _get_episode_statistics(self):
        episode_statistics = {}
        episode_statistics['Rewards/max_episodes']  = np.max(self.episode_reward_buffer)
        episode_statistics['Rewards/mean_episodes'] = np.mean(self.episode_reward_buffer)
        episode_statistics['Rewards/min_episodes']  = np.min(self.episode_reward_buffer)
        episode_statistics['Len/max_episodes']  = np.max(self.episode_len_buffer)
        episode_statistics['Len/mean_episodes'] = np.mean(self.episode_len_buffer)
        episode_statistics['Len/min_episodes']  = np.min(self.episode_len_buffer)
        return episode_statistics


## Impala

In [None]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        out = F.relu(x)
        out = self.conv1(out)
        out = F.relu(out)
        out = self.conv2(out)
        return out + x

class ImpalaBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.res1 = ResidualBlock(out_channels)
        self.res2 = ResidualBlock(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.maxpool(x)
        x = self.res1(x)
        x = self.res2(x)
        return x

class ImpalaModel(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.block1 = ImpalaBlock(in_channels=in_channels, out_channels=16)
        self.block2 = ImpalaBlock(in_channels=16, out_channels=32)
        self.block3 = ImpalaBlock(in_channels=32, out_channels=32)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(in_features=32 * 8 * 8, out_features=256)

        self.output_dim = 256

    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = F.relu(x)
        x = self.flatten(x)
        x = self.fc(x)
        x = F.relu(x)
        return x

In [None]:

class CategoricalPolicy(nn.Module):
    def __init__(self,
                 embedder,
                 action_size):
        """
        embedder: (torch.Tensor) model to extract the embedding for observation
        action_size: number of the categorical actions
        """
        super().__init__()
        self.embedder = embedder
        self.fc_policy = nn.Linear(self.embedder.output_dim, action_size)
        self.fc_value = nn.Linear(self.embedder.output_dim, 1)

    def forward(self, x):
        hidden = self.embedder(x)
        logits = self.fc_policy(hidden)
        p = Categorical(logits=logits)
        v = self.fc_value(hidden).reshape(-1)
        return p, v


<!--  -->

In [None]:
"""
See https://github.com/openai/baselines/tree/master/baselines/common/vec_env
"""

class VecEnv(ABC):
    """
    An abstract asynchronous, vectorized environment.
    Used to batch data from multiple copies of an environment, so that
    each observation becomes an batch of observations, and expected action is a batch of actions to
    be applied per-environment.
    """
    closed = False
    viewer = None

    metadata = {
        'render.modes': ['human', 'rgb_array']
    }

    def __init__(self, num_envs, observation_space, action_space):
        self.num_envs = num_envs
        self.observation_space = observation_space
        self.action_space = action_space

    @abstractmethod
    def reset(self):
        """
        Reset all the environments and return an array of
        observations, or a dict of observation arrays.
        If step_async is still doing work, that work will
        be cancelled and step_wait() should not be called
        until step_async() is invoked again.
        """
        pass

    @abstractmethod
    def step_async(self, actions):
        """
        Tell all the environments to start taking a step
        with the given actions.
        Call step_wait() to get the results of the step.
        You should not call this if a step_async run is
        already pending.
        """
        pass

    @abstractmethod
    def step_wait(self):
        """
        Wait for the step taken with step_async().
        Returns (obs, rews, dones, infos):
         - obs: an array of observations, or a dict of
                arrays of observations.
         - rews: an array of rewards
         - dones: an array of "episode done" booleans
         - infos: a sequence of info objects
        """
        pass

    def close_extras(self):
        """
        Clean up the  extra resources, beyond what's in this base class.
        Only runs when not self.closed.
        """
        pass

    def close(self):
        if self.closed:
            return
        if self.viewer is not None:
            self.viewer.close()
        self.close_extras()
        self.closed = True

    def step(self, actions):
        """
        Step the environments synchronously.
        This is available for backwards compatibility.
        """
        self.step_async(actions)
        return self.step_wait()

    def render(self, mode='human'):
        imgs = self.get_images()
        bigimg = "ARGHH" #tile_images(imgs)
        if mode == 'human':
            self.get_viewer().imshow(bigimg)
            return self.get_viewer().isopen
        elif mode == 'rgb_array':
            return bigimg
        else:
            raise NotImplementedError

    def get_images(self):
        """
        Return RGB images from each environment
        """
        raise NotImplementedError

    @property
    def unwrapped(self):
        if isinstance(self, VecEnvWrapper):
            return self.venv.unwrapped
        else:
            return self

    def get_viewer(self):
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.SimpleImageViewer()
        return self.viewer


class VecEnvWrapper(VecEnv):
    """
    An environment wrapper that applies to an entire batch
    of environments at once.
    """

    def __init__(self, venv, observation_space=None, action_space=None):
        self.venv = venv
        super().__init__(num_envs=venv.num_envs,
                        observation_space=observation_space or venv.observation_space,
                        action_space=action_space or venv.action_space)

    def step_async(self, actions):
        self.venv.step_async(actions)

    @abstractmethod
    def reset(self):
        pass

    @abstractmethod
    def step_wait(self):
        pass

    def close(self):
        return self.venv.close()

    def render(self, mode='human'):
        return self.venv.render(mode=mode)

    def get_images(self):
        return self.venv.get_images()

    def __getattr__(self, name):
        if name.startswith('_'):
            raise AttributeError("attempted to get missing private attribute '{}'".format(name))
        return getattr(self.venv, name)


class VecEnvObservationWrapper(VecEnvWrapper):
    @abstractmethod
    def process(self, obs):
        pass

    def reset(self):
        obs = self.venv.reset()
        return self.process(obs)

    def step_wait(self):
        obs, rews, dones, infos = self.venv.step_wait()
        return self.process(obs), rews, dones, infos


class VecExtractDictObs(VecEnvObservationWrapper):
    def __init__(self, venv, key):
        self.key = key
        super().__init__(venv=venv,
            observation_space=venv.observation_space.spaces[self.key])

    def process(self, obs):
        return obs[self.key]


class RunningMeanStd(object):
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = epsilon

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]
        self.update_from_moments(batch_mean, batch_var, batch_count)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        self.mean, self.var, self.count = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var, batch_count)


def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
    delta = batch_mean - mean
    tot_count = count + batch_count

    new_mean = mean + delta * batch_count / tot_count
    m_a = var * count
    m_b = batch_var * batch_count
    M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
    new_var = M2 / tot_count
    new_count = tot_count

    return new_mean, new_var, new_count


class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)

        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None

        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        for i in range(len(infos)):
            infos[i]['env_reward'] = rews[i]
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)


class TransposeFrame(VecEnvWrapper):
    def __init__(self, env):
        super().__init__(venv=env)
        obs_shape = self.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(obs_shape[2], obs_shape[0], obs_shape[1]), dtype=np.float32)

    def step_wait(self):
        obs, reward, done, info = self.venv.step_wait()
        return obs.transpose(0,3,1,2), reward, done, info

    def reset(self):
        obs = self.venv.reset()
        return obs.transpose(0,3,1,2)


class ScaledFloatFrame(VecEnvWrapper):
    def __init__(self, env):
        super().__init__(venv=env)
        obs_shape = self.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=obs_shape, dtype=np.float32)

    def step_wait(self):
        obs, reward, done, info = self.venv.step_wait()
        return obs/255.0, reward, done, info

    def reset(self):
        obs = self.venv.reset()
        return obs/255.0

In [None]:
class PPO:
    def __init__(self,
                 env,
                 policy,
                 logger,
                 storage,
                 device,
                 n_steps,
                 n_envs,
                 epoch,
                 mini_batch_per_epoch,
                 mini_batch_size,
                 gamma,
                 lmbda,
                 learning_rate,
                 eps_clip,
                 value_coef,
                 entropy_coef):

        self.env = env
        self.policy = policy
        self.logger = logger
        self.storage = storage
        self.device = device
        self.n_steps = n_steps
        self.n_envs = n_envs
        self.epoch = epoch
        self.mini_batch_per_epoch = mini_batch_per_epoch
        self.mini_batch_size = mini_batch_size
        self.gamma = gamma
        self.lmbda = lmbda
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate, eps=1e-5)
        self.eps_clip = eps_clip
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.t = 0

    def predict(self, obs, done):
        with torch.no_grad():
            obs = torch.FloatTensor(obs).to(device=self.device)
            dist, value = self.policy(obs)
            act = dist.sample()
            log_prob_act = dist.log_prob(act)

        return act.cpu().numpy(), log_prob_act.cpu().numpy(), value.cpu().numpy()

    def optimize(self):
        pi_loss_list, value_loss_list, entropy_loss_list = [], [], []
        self.policy.train()
        for e in range(self.epoch):
            generator = self.storage.fetch_train_generator(mini_batch_size=self.mini_batch_size)
            for sample in generator:
                obs_batch, act_batch, done_batch, \
                    old_log_prob_act_batch, old_value_batch, return_batch, adv_batch = sample
                dist_batch, value_batch = self.policy(obs_batch)

                # Clipped Surrogate Objective
                log_prob_act_batch = dist_batch.log_prob(act_batch)
                ratio = torch.exp(log_prob_act_batch - old_log_prob_act_batch)
                surr1 = ratio * adv_batch
                surr2 = torch.clamp(ratio, 1.0 - self.eps_clip, 1.0 + self.eps_clip) * adv_batch
                pi_loss = -torch.min(surr1, surr2).mean()

                # useful info to log
                approx_kl = (old_log_prob_act_batch - log_prob_act_batch).mean().item()
                clipped = ratio.gt(1+self.eps_clip) | ratio.lt(1-self.eps_clip)
                clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()

                # Clipped Bellman-Error
                clipped_value_batch = old_value_batch + (value_batch - old_value_batch).clamp(-self.eps_clip, self.eps_clip)
                v_surr1 = (value_batch - return_batch).pow(2)
                v_surr2 = (clipped_value_batch - return_batch).pow(2)
                value_loss = 0.5 * torch.max(v_surr1, v_surr2).mean()

                # Policy Entropy
                entropy_loss = dist_batch.entropy().mean()
                loss = pi_loss + self.value_coef * value_loss - self.entropy_coef * entropy_loss
                loss.backward()

                self.optimizer.step()
                self.optimizer.zero_grad()
                pi_loss_list.append(pi_loss.item())
                value_loss_list.append(value_loss.item())
                entropy_loss_list.append(entropy_loss.item())

        summary = {'loss/clipfrac': clipfrac,
                   'loss/approxkl': approx_kl,
                   'loss/policy_loss': np.mean(pi_loss_list),
                   'loss/value_loss': np.mean(value_loss_list),
                   'loss/policy_entropy': np.mean(entropy_loss_list)}
        return summary

    def train(self, num_timesteps):
        obs = self.env.reset()
        done = np.zeros(self.n_envs)

        while self.t < num_timesteps:
            # Run Policy
            self.policy.eval()
            for _ in range(self.n_steps):
                act, log_prob_act, value = self.predict(obs, done)
                next_obs, rew, done, info = self.env.step(act)
                self.storage.store(obs, act, rew, done, info, log_prob_act, value)
                obs = next_obs
            _, _, last_val = self.predict(obs, done)
            self.storage.store_last(obs, last_val)
            # Compute advantage estimates
            self.storage.compute_estimates(self.gamma, self.lmbda)

            # Optimize policy & valueq
            summary = self.optimize()
            # Log the training-procedure
            self.t += self.n_steps * self.n_envs
            rew_batch, done_batch = self.storage.fetch_log_data()
            self.logger.feed(rew_batch, done_batch)
            self.logger.write_summary(summary)
            self.logger.dump()
        self.env.close()