In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pickle
import os
import yaml
import random
from utils.converter import Converter
from utils.replay_buffer import ReplayBuffer
from agents.network import QNetwork
from utils.data_saver import TrajectoryDataLoader
import grid2op 
from grid2op.Action import TopologyChangeAction

	c:\Users\Ernest\.conda\envs\l2rpn-test\python.exe -m pip install numba



In [53]:
class OfflineQAgent:
    """Q-learning agent that is trained from batches of pre-collected transitions.

    The agent wraps a ``QNetwork`` and exposes:

    * ``choose_action`` - pick an action index for one observation vector,
    * ``learn``         - one gradient step on a batch of transitions,
    * ``save_model`` / ``load_model`` - persist / restore the network weights.

    Config keys read from ``cfg``: ``MEM_SIZE``, ``BATCH_SIZE``, ``LR``,
    ``EXPLORATION_MAX``, ``EXPLORATION_MIN``, ``EXPLORATION_DECAY``,
    ``EPSILON_RATE`` and ``GAMMA``.
    """

    # File name written by save_model() and looked up by load_model() when it
    # is handed a directory.
    MODEL_FILENAME = "offlineDQN.pth"

    def __init__(self, cfg, env):
        """Build the network, converter and bookkeeping state.

        Args:
            cfg: mapping with the config keys listed in the class docstring.
            env: environment handed to ``Converter`` and used to sample
                random actions in ``choose_action``.
        """
        self.cfg = cfg
        self.env = env
        self.converter = Converter(self.env)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayBuffer(self.cfg['MEM_SIZE'], self.cfg['BATCH_SIZE'])
        self.q_net = QNetwork(self.cfg).to(self.device)
        # NOTE(review): learn() steps ``self.q_net.optimizer``, not this
        # optimizer, so this attribute is currently unused. Kept so existing
        # code that reads ``agent.optimizer`` keeps working.
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=self.cfg['LR'])
        # One Python float per learn() call (see learn()).
        self.losses = []
        self.exploration_rate = self.cfg['EXPLORATION_MAX']

    def choose_action(self, state):
        """Return an action for ``state``.

        With probability governed by ``cfg['EPSILON_RATE']`` a random
        environment action is sampled and converted through the converter;
        otherwise the index of the largest Q-value is returned as an int.

        Args:
            state: observation vector accepted by ``torch.tensor``.
        """
        # NOTE(review): a random action is taken when the draw is *greater*
        # than EPSILON_RATE, i.e. with probability 1 - EPSILON_RATE. This is
        # the reverse of the usual epsilon-greedy test, and the decayed
        # ``self.exploration_rate`` is never consulted here. Confirm the
        # intended meaning of EPSILON_RATE before changing it.
        if random.random() > self.cfg['EPSILON_RATE']:
            return self.converter.convert_env_act_to_one_hot_encoding_act(
                self.env.action_space.sample().to_vect()
            )

        state = torch.tensor(state).to(self.device)
        # Pure inference: no autograd graph is needed to take an argmax.
        with torch.no_grad():
            q_val = self.q_net(state)
        return torch.argmax(q_val).item()

    def learn(self, states, actions, rewards, states_, dones):
        """Run one Q-learning gradient step on a batch of transitions.

        All five arguments must expose ``.to(device)`` and share the same
        leading (batch) dimension. The batch may have any length; it does not
        have to equal ``cfg['BATCH_SIZE']``.

        Side effects: appends the scalar loss (a float) to ``self.losses``,
        steps ``self.q_net.optimizer`` and decays ``self.exploration_rate``
        towards ``cfg['EXPLORATION_MIN']``.
        """
        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        states_ = states_.to(self.device)
        dones = dones.to(self.device)

        # Index rows by the real batch length so a short final batch from the
        # loader does not raise a shape-mismatch error.
        batch_indices = np.arange(states.shape[0], dtype=np.int64)

        q_values = self.q_net(states)
        # NOTE(review): the bootstrap target is not detached, so gradients
        # also flow through next_q_values. Standard DQN treats the target as
        # a constant - confirm whether that is intended.
        next_q_values = self.q_net(states_)

        predicted_value_of_now = q_values[batch_indices, actions]
        predicted_value_of_future = torch.max(next_q_values, dim=1)[0]

        # NOTE(review): ``dones`` multiplies the bootstrap term, so it has to
        # be a continuation mask (1 = episode continues, 0 = terminal). If the
        # data loader provides raw terminal flags this must be (1 - dones).
        # Verify against TrajectoryDataLoader.
        q_target = rewards + self.cfg['GAMMA'] * predicted_value_of_future * dones

        loss = self.q_net.loss(q_target, predicted_value_of_now)
        # Store a plain float: keeping the tensor would keep autograd state
        # (and device memory) alive for every step of training.
        self.losses.append(loss.item())
        self.q_net.optimizer.zero_grad()
        loss.backward()
        self.q_net.optimizer.step()

        self.exploration_rate *= self.cfg['EXPLORATION_DECAY']
        self.exploration_rate = max(self.cfg['EXPLORATION_MIN'], self.exploration_rate)

    def returning_epsilon(self):
        """Return the current (decayed) exploration rate."""
        return self.exploration_rate

    def save_model(self, path):
        """Write the network weights to ``<path>/offlineDQN.pth``.

        ``path`` is a directory; it is created if it does not exist.
        """
        os.makedirs(path, exist_ok=True)
        torch.save(self.q_net.state_dict(), os.path.join(path, self.MODEL_FILENAME))

    def load_model(self, path):
        """Restore network weights written by ``save_model``.

        Args:
            path: either the directory that was passed to ``save_model`` or
                the path of a state-dict file.
        """
        # Accept the same directory argument save_model() takes.
        if os.path.isdir(path):
            checkpoint_file = os.path.join(path, self.MODEL_FILENAME)
        else:
            checkpoint_file = path
        # Read the file first, then hand the resulting state dict to the
        # network (the previous code had the two calls nested the wrong way
        # round). map_location lets GPU-saved weights load on CPU-only hosts.
        state_dict = torch.load(checkpoint_file, map_location=self.device)
        self.q_net.load_state_dict(state_dict)

In [47]:

def read_yaml_file(file_path):
    """Parse a YAML file and return its content.

    The file is read as UTF-8 instead of the platform's locale encoding so
    that the same config parses identically on every OS.

    Args:
        file_path: path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load``, or ``None`` when the file
        is not valid YAML (the parse error is printed in that case).

    Raises:
        OSError: if the file cannot be opened (not caught here).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            return yaml.safe_load(file)
        except yaml.YAMLError as e:
            print(f"Error reading YAML file: {e}")
            # Best-effort contract kept from the original: callers get None
            # on a malformed file and must check for it.
            return None

In [48]:
# Grid2Op environment used for evaluation; swap the name to try another one.
env_name = "rte_case5_example"
env = grid2op.make(
    env_name,
    test=True,
    action_class=TopologyChangeAction,
)



In [54]:
# Recorded transitions, iterated in the training cell as
# (states, actions, rewards, next_states, dones) batches.
# The path is assembled with os.path.join: the previous hard-coded
# backslash separator only resolved on Windows.
data = TrajectoryDataLoader(os.path.join("Data", "trajectory.pkl"), batch_size=32)
# Hyper-parameters for the agent; read_yaml_file returns None on a parse error.
yaml_data = read_yaml_file("config.yml")

In [55]:
# Agent configured from the YAML file, plus a standalone converter that the
# rollout cells use to turn action indices into environment actions.
agent = OfflineQAgent(yaml_data, env)
converter = Converter(env)

In [56]:
best_score = 0
# Start from a fresh agent so re-running this cell does not continue training
# an already-trained network.
agent = OfflineQAgent(env=env, cfg=yaml_data)

for i in range(1, 50):
    print(f"Episode {i}")
    for batch in data:
        states, actions, rewards, next_states, dones = batch
        # Only full batches are used for learning. The previous filter
        # compared against the literal shape [16, 182], i.e. the trailing
        # partial batch of one specific dataset; checking the batch length
        # against the configured size expresses the same rule for any
        # dataset or feature width.
        if states.shape[0] != agent.cfg['BATCH_SIZE']:
            continue
        agent.learn(states, actions, rewards, next_states, dones)
    

Episode 1
Episode 2
Episode 3
Episode 4
Episode 5
Episode 6
Episode 7
Episode 8
Episode 9
Episode 10
Episode 11
Episode 12
Episode 13
Episode 14
Episode 15
Episode 16
Episode 17
Episode 18
Episode 19
Episode 20
Episode 21
Episode 22
Episode 23
Episode 24
Episode 25
Episode 26
Episode 27
Episode 28
Episode 29
Episode 30
Episode 31
Episode 32
Episode 33
Episode 34
Episode 35
Episode 36
Episode 37
Episode 38
Episode 39
Episode 40
Episode 41
Episode 42
Episode 43
Episode 44
Episode 45
Episode 46
Episode 47
Episode 48
Episode 49


In [36]:
# Roll out 10 episodes with the trained agent and report the summed reward.
for i in range(10):
    obs = env.reset()
    score = 0
    done = False
    while not done:
        action = agent.choose_action(obs.to_vect())
        # index -> one-hot -> environment action
        obs, reward, done, _ = env.step(
            converter.convert_one_hot_encoding_act_to_env_act(
                converter.int_one_hot(action)
            )
        )
        score += reward
    print(f"Episode {i} Score {score}")

Episode 0 Score 11673.204224586487
Episode 1 Score 11101.55497789383
Episode 2 Score 13711.604459285736
Episode 3 Score 13078.260832071304
Episode 4 Score 13340.005165815353
Episode 5 Score 1758.710616350174
Episode 6 Score 14088.209115505219
Episode 7 Score 13795.545959234238
Episode 8 Score 12642.432779788971
Episode 9 Score 13703.679738044739


In [57]:
from tqdm.notebook import tqdm
import numpy as np
# Play one scenario to the end, recording each reward and each observation
# that precedes a further step.
obs = env.reset()
all_obs = [obs]
reward = env.reward_range[0]
reward_list = []
done = False
nb_step = 0
print("Very Offline RL Simulation")


with tqdm(total=env.chronics_handler.max_timestep()) as pbar:
    while not done:
        action = agent.choose_action(obs.to_vect())
        obs, reward, done, _ = env.step(
            converter.convert_one_hot_encoding_act_to_env_act(
                converter.int_one_hot(action)
            )
        )
        reward_list.append(reward)
        pbar.update(1)
        # The terminal observation is neither stored nor counted as a step.
        if not done:
            all_obs.append(obs)
            nb_step += 1

# Array snapshot of the per-step rewards.
reward_list_simple_DQN = np.array(reward_list)

Very Offline RL Simulation


  0%|          | 0/2016 [00:00<?, ?it/s]

In [63]:
# Persist the trained Q-network weights: save_model creates this directory if
# it is missing and writes "offlineDQN.pth" inside it.
agent.save_model("./Agents/OfflineDQN")