In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.distributions import Categorical, Normal
from torchsummary import summary

import gym
from gym.wrappers import Monitor

import numpy as np


def to_tensor(x):
    return torch.from_numpy(x).float()




In [2]:
class ActorCritic(nn.Module):
    def __init__(self, observation_space, action_space, hidden_layers=[512,256]):
        super().__init__()
        
        self.hidden_layers_dimensions = hidden_layers

        self.model = nn.Sequential(
            nn.Linear(observation_space, hidden_layers[0]),
            nn.ReLU(),
            nn.Linear(hidden_layers[0], hidden_layers[1]),
            nn.ReLU()
        )

        self.policy = nn.Linear(hidden_layers[1], action_space)
        self.value = nn.Linear(hidden_layers[1], 1)
        self.policy_activation = nn.Softmax(action_space)
        
    def forward(self, x):
        l = self.model(x)
        value = self.value(l)
        # Softmax for discrete pleasure
        policy = self.policy(l)
        policy = self.policy_activation(policy)
        return value, policy

In [7]:
class Agent:
    def __init__(self, lr=0.0003, gamma=0.99, environment="CartPole-v1"):
        self.lr = lr
        self.gamma = gamma
        self.env = gym.make(environment)
        self.observation_space = self.env.observation_space.shape[0]
        # TODO - discrete
        self.n_actions = self.env.action_space.n

        #np.random.seed(seed=config['seed'])
        #torch.manual_seed(seed=config['seed'])
        #self.env.seed(config['seed'])
        
        #self.monitor_env = Monitor(self.env, "./videos", force=True, video_callable=lambda episode: True)

        self.actor_critic = ActorCritic(self.observation_space, self.n_actions)
        self.optimizer = optim.Adam(self.actor_critic.parameters(), self.lr)
    
    def save_model(self):
        pass
    
    def load_model(self):
        pass
    
    def choose_action(self, observation):
        state = to_tensor(observation.reshape(-1, 1, self.observation_space))
        _, policy = self.actor_critic(state)

        action_probabilities = Categorical(policy)
        action = action_probabilities.sample()
        log_prob = action_probabilities.log_prob(action)
        self.action = action

        return action.numpy()[0]
    

    def learn(self, state, state_, reward, done):
        state, state_, reward = to_tensor(state), to_tensor(state_), to_tensor(reward)
        state = state.reshape(-1, 1, self.observation_space)
        state_ = state_.reshape(-1, 1, self.observation_space)
        
        state_val, policy = self.actor_critic(state)
        state_val_, _ = self.actor_critic(state_)
        state_val = state_val[0]
        state_val_ = state_val_[0]
        
        # TODO - for different em = Categorical(probs)nv - differenc distribution!!!
        policy_distribution =  Categorical(policy)
        log_policy = policy_distribution.log_prob(self.action)
        
        delta = reward + self.gamma * state_val_ * (1 - int(done)) - state_val
        actor_loss = - log_policy * delta
        critic_loss = delta ** 2
        total_loss = actor_loss + critic_loss
        
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
    
    def run(self, epochs=5000, max_steps=128, train=True):
        self.history = {
            "score": [],
        }
        cache = []
        
        for epoch in range(epochs):
            observation = self.env.reset()
            done = False
            rewards = 0
            while not done:
                action = self.choose_action(observation)
                observation_, reward, done, info = self.env.step(action[0])
                rewards += reward
                if train:
                    self.learn(observation, observation_, reward, done)
                observation = observation_
            # Write down history of rewards for each run (scalar)
            self.history["score"].append(rewards)
            cache.append(rewards)
            if not (epoch % 100):
                print(f'Run {epoch}. - reward: {sum(cache)/len(cache)}')
                cache.clear()
        return history

In [None]:
agent = Agent()
agent.run()

Run 0. - reward: 13.0
Run 100. - reward: 41.91
Run 200. - reward: 115.52
Run 300. - reward: 133.53
Run 400. - reward: 170.76
Run 500. - reward: 184.6
Run 600. - reward: 167.06
Run 700. - reward: 211.77
Run 800. - reward: 251.0
Run 900. - reward: 225.34
Run 1000. - reward: 229.88
Run 1100. - reward: 249.51
