In [1]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [2]:
class GenericNetwork(nn.Module):
    """Fully connected network with two ReLU hidden layers and its own optimizer.

    The network owns an Adam optimizer over its parameters and moves itself to
    the first CUDA device when one is available, otherwise to the CPU.

    Args:
        alpha: learning rate passed to the Adam optimizer.
        input_dims: number of input features of the first linear layer.
        fc1_dims: width of the first hidden layer.
        fc2_dims: width of the second hidden layer.
        n_actions: number of output units of the final linear layer.
    """
    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims,
                 n_actions):
        super(GenericNetwork, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.fc1 = nn.Linear(self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.fc3 = nn.Linear(self.fc2_dims, n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=alpha)

        # Fall back to the CPU when CUDA is unavailable; a 'cuda:1' fallback
        # would make the .to() call below fail on any machine without CUDA.
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, observation):
        """Run a forward pass.

        Args:
            observation: array-like of input features; it is converted to a
                float tensor and moved to the network's device.

        Returns:
            Tensor of raw (un-normalised) outputs, one value per output unit.
        """
        state = T.Tensor(observation).to(self.device)
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        # No activation on the last layer: callers apply softmax (actor) or use
        # the value directly (critic).
        x = self.fc3(x)
        return x

In [3]:
class ActorCriticNetwork(nn.Module):
    """Network with a shared two-layer trunk and separate policy / value heads.

    The network owns an Adam optimizer over all of its parameters and moves
    itself to the first CUDA device when one is available, otherwise to the CPU.

    Args:
        alpha: learning rate passed to the Adam optimizer.
        input_dims: number of input features of the first linear layer.
        fc1_dims: width of the first shared hidden layer.
        fc2_dims: width of the second shared hidden layer.
        n_actions: number of output units of the policy head.
    """
    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims,
                 n_actions):
        super(ActorCriticNetwork, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.fc1 = nn.Linear(self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        # Two heads read the same hidden features: policy logits and a scalar value.
        self.pi = nn.Linear(self.fc2_dims, n_actions)
        self.v = nn.Linear(self.fc2_dims, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=alpha)

        # Fall back to the CPU when CUDA is unavailable; a 'cuda:1' fallback
        # would make the .to() call below fail on any machine without CUDA.
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, observation):
        """Run a forward pass through the shared trunk and both heads.

        Args:
            observation: array-like of input features; it is converted to a
                float tensor and moved to the network's device.

        Returns:
            Tuple ``(pi, v)``: the raw policy-head outputs (no softmax applied)
            and the value-head output.
        """
        state = T.Tensor(observation).to(self.device)
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = self.pi(x)
        v = self.v(x)
        return (pi, v)

In [4]:
class Agent(object):
    """Actor-critic agent that uses separate actor and critic networks.

    This is appropriate for very simple environments, such as mountain car.

    Args:
        alpha: learning rate of the actor network.
        beta: learning rate of the critic network.
        input_dims: number of input features fed to both networks.
        gamma: discount factor applied to the next state's value.
        layer1_size: width of the first hidden layer of both networks.
        layer2_size: width of the second hidden layer of both networks.
        n_actions: number of discrete actions (actor output units).
    """
    def __init__(self, alpha, beta, input_dims, gamma=0.99,
                 layer1_size=256, layer2_size=256, n_actions=2):
        self.gamma = gamma
        self.actor = GenericNetwork(alpha, input_dims, layer1_size,
                                    layer2_size, n_actions=n_actions)
        # The critic reuses the same architecture with a single output unit.
        self.critic = GenericNetwork(beta, input_dims, layer1_size,
                                     layer2_size, n_actions=1)
        # Log-probability of the most recently sampled action; set by
        # choose_action() and read by learn().
        self.log_probs = None

    def choose_action(self, observation):
        """Sample an action from the actor's policy for ``observation``.

        The log-probability of the sampled action is stored on
        ``self.log_probs`` for the next call to learn().

        Returns:
            The sampled action as a Python int.
        """
        # Normalise over the last (action) dimension explicitly; relying on the
        # implicit dimension is deprecated and warns on every call.
        probabilities = F.softmax(self.actor.forward(observation), dim=-1)
        action_probs = T.distributions.Categorical(probabilities)
        action = action_probs.sample()
        self.log_probs = action_probs.log_prob(action)

        return action.item()

    def learn(self, state, reward, new_state, done):
        """Perform one temporal-difference update of both networks.

        Must be called after choose_action(), which provides ``self.log_probs``.

        Args:
            state: observation the last action was chosen from.
            reward: scalar reward received for that action.
            new_state: observation reached after the action.
            done: truthy when the episode terminated at ``new_state``.
        """
        self.actor.optimizer.zero_grad()
        self.critic.optimizer.zero_grad()

        critic_value_ = self.critic.forward(new_state)
        critic_value = self.critic.forward(state)
        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)

        # TD error; the bootstrap term is zeroed on terminal transitions.
        delta = reward + self.gamma*critic_value_*(1-int(done)) - critic_value

        # NOTE(review): delta is not detached here, so the actor term also
        # sends gradient into the critic — confirm this is intended.
        actor_loss = -self.log_probs * delta
        critic_loss = delta**2

        (actor_loss + critic_loss).backward()

        self.actor.optimizer.step()
        self.critic.optimizer.step()


In [5]:
class NewAgent(object):
    """Actor-critic agent backed by a single ActorCriticNetwork.

    The policy head and the value head share the network's lowest layers.
    Intended for more complex environments such as the discrete lunar lander.
    """
    def __init__(self, alpha, input_dims, gamma=0.99,
                 layer1_size=256, layer2_size=256, n_actions=2):
        # Log-probability of the most recently sampled action; written by
        # choose_action() and consumed by learn().
        self.log_probs = None
        self.gamma = gamma
        self.actor_critic = ActorCriticNetwork(
            alpha, input_dims, layer1_size, layer2_size, n_actions=n_actions)

    def choose_action(self, observation):
        """Sample an action for ``observation`` and remember its log-probability.

        Returns:
            The sampled action as a Python int.
        """
        # Only the policy head is needed here; the value output is discarded.
        logits, _ = self.actor_critic.forward(observation)
        policy = T.distributions.Categorical(F.softmax(logits, dim=-1))
        sampled = policy.sample()
        self.log_probs = policy.log_prob(sampled)

        return sampled.item()

    def learn(self, state, reward, new_state, done):
        """Perform one temporal-difference update of the shared network.

        Args:
            state: observation the last action was chosen from.
            reward: scalar reward received for that action.
            new_state: observation reached after the action.
            done: truthy when the episode terminated at ``new_state``.
        """
        network = self.actor_critic
        network.optimizer.zero_grad()

        # Value estimates for the successor state first, then the current state.
        _, next_value = network.forward(new_state)
        _, current_value = network.forward(state)
        reward = T.tensor(reward, dtype=T.float).to(network.device)

        # TD error; the bootstrap term vanishes on terminal transitions.
        td_error = (reward
                    + self.gamma*next_value*(1-int(done))
                    - current_value)

        policy_loss = -self.log_probs * td_error
        value_loss = td_error**2
        total_loss = policy_loss + value_loss
        total_loss.backward()

        network.optimizer.step()

In [1]:
import os

import grid2op
import gym
from grid2op.Action import TopologyChangeAction

from actor_critic import ACAgent
from utils import Converter

	c:\Users\Ernest\.conda\envs\l2rpn-test\python.exe -m pip install numba



In [2]:
# Create the grid2op environment (with TopologyChangeAction as its action class)
# and a gym LunarLander environment.
# NOTE(review): in the visible cells `gym_env` is only referenced from a
# commented-out line in the training loop — confirm it is still needed.
env = grid2op.make('rte_case5_example', test=True, action_class=TopologyChangeAction)
gym_env = gym.make('LunarLander-v2')



In [3]:
# Wrap the grid2op environment in the project-local Converter; later cells use it
# to turn the agent's integer action into an action accepted by env.step().
converter = Converter(env)

In [4]:
# NOTE(review): `hp` is not read by any visible cell — presumably a leftover
# hyper-parameter grid; confirm before removing.
hp = {'alpha':[0.00001, 0.00003]}
# Agent that the training loop below updates. The two positional arguments are
# presumably the learning rate and the observation-vector size — verify against
# ACAgent's signature.
# NOTE(review): the instance name `Agent` is capitalised like a class name.
Agent = ACAgent(0.00001, 182, n_actions=132)

In [5]:
# Second agent built with the same constructor arguments as `Agent`; its weights
# come from load_model() rather than from training in this notebook.
a1 = ACAgent(0.00001, 182, n_actions=132)
# Build the checkpoint path with os.path.join instead of a hard-coded "\\"
# separator so the cell also works on non-Windows systems.
a1.load_model(os.path.join("AC-1", "actor_critic.pth"))

In [5]:
# Quick check: reset the environment and ask `Agent` for a single action from the
# vectorised observation.
obs = env.reset()
action = Agent.choose_action(obs.to_vect())
# Bare last expression so the notebook displays the chosen action.
action

21

In [6]:
# Train `Agent` online for a fixed number of episodes: one learn() call after
# every environment step, and one summed score recorded per episode.
num_episodes = 20
score_history = []
score = 0
for i in range(num_episodes):

    # Each episode starts with a zero score and a freshly reset environment.
    score = 0
    done = False
    observation = env.reset()
    while not done:
        action = Agent.choose_action(observation.to_vect())
        # Integer action -> one-hot vector -> environment action.
        observation_, reward, done, info = env.step(
            converter.convert_one_hot_encoding_act_to_env_act(
                converter.int_to_onehot(action)
            )
        )
        # Update on the transition just observed, then advance the state.
        Agent.learn(observation.to_vect(), reward, observation_.to_vect(), done)
        score += reward
        observation = observation_

    score_history.append(score)
    print('episode: ', i, 'score: %.2f' % score)


episode:  0 score: 11518.82
episode:  1 score: 13401.44
episode:  2 score: 13272.94
episode:  3 score: 11799.46
episode:  4 score: 12250.94
episode:  5 score: 12723.99
episode:  6 score: 12881.77
episode:  7 score: 13649.13
episode:  8 score: 13455.55
episode:  9 score: 13175.82
episode:  10 score: 12067.46
episode:  11 score: 14021.87
episode:  12 score: 11490.97
episode:  13 score: 11449.31
episode:  14 score: 12895.59
episode:  15 score: 11996.70
episode:  16 score: 12992.36
episode:  17 score: 12015.34
episode:  18 score: 12391.00
episode:  19 score: 12413.68


In [9]:
# Save the agent under "AC-1", the same leading path component used when the
# checkpoint for `a1` is loaded.
Agent.save_model("AC-1")

In [6]:
from tqdm.notebook import tqdm
import numpy as np
# Roll out agent `a1` for one episode without calling learn(), recording the
# observations and rewards along the way.
all_obs = []
obs = env.reset()
all_obs.append(obs)
# In the visible code these initial values of `reward` and `done` would only be
# read by the commented-out `my_agent.act(...)` call below.
reward = env.reward_range[0]
reward_list = []
done = False
nb_step = 0
print("Very Simple Actor Critic Simulation")
# Progress bar sized to the maximum timestep reported by the chronics handler.
with tqdm(total=env.chronics_handler.max_timestep()) as pbar:
    while True:
        action = a1.choose_action(obs.to_vect())
        #action = my_agent.act(obs, reward, done)
        # Integer action -> one-hot vector -> environment action.
        obs, reward, done, _ = env.step(converter.convert_one_hot_encoding_act_to_env_act(converter.int_to_onehot(action)))
        reward_list.append(reward)
        pbar.update(1)
        if done:
            break
        # Only reached on non-terminal steps: the terminal observation is not
        # stored in all_obs and the terminal step is not counted in nb_step,
        # although its reward was appended above.
        all_obs.append(obs)
        nb_step += 1

# Copy the collected rewards into a NumPy array.
# NOTE(review): the name mentions DQN although this cell runs the actor-critic
# agent `a1` — confirm the name before relying on it elsewhere.
reward_list_simple_DQN = np.copy(reward_list)

Very Simple Actor Critic Simulation


  0%|          | 0/2016 [00:00<?, ?it/s]