In [2]:
import os

# Step up one directory only when `src` is not already visible from the current
# working directory. An unconditional `os.chdir('../')` climbs one more level on
# every re-run of this cell, which silently breaks the later `from src...` import
# and every relative path used afterwards.
# NOTE(review): assumes the project root is the directory that contains `src/`
# and that the notebook's own folder does not — confirm against the repo layout.
if not os.path.isdir('src'):
    os.chdir('../')
os.getcwd()

'e:\\github_clone\\gridGPT'

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import torch.optim as optim
import grid2op
from src.utils.converter import ActionConverter
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [4]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.distributions import Categorical
import torch.optim as optim
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class ActorCritic(nn.Module):
    """Actor-critic network with a shared trunk and per-episode rollout buffers.

    A stack of fully connected layers (``affine``) feeds two linear heads:
    ``action_layer`` produces the logits of a categorical policy and
    ``value_layer`` produces a scalar state-value estimate.

    Every call to :meth:`forward` samples one action and appends its
    log-probability and the value estimate to ``logprobs`` / ``state_values``.
    The caller appends the matching reward to ``rewards`` after each
    environment step, calls :meth:`calculateLoss` at the end of the episode
    and then :meth:`clearMemory`.

    Args:
        state_dim: Size of the flat observation vector (default 493).
        action_dim: Number of discrete actions (default 178).
    """

    def __init__(self, state_dim=493, action_dim=178):
        super().__init__()
        self.affine = nn.Sequential(
            nn.Linear(state_dim, 512), nn.ReLU(),
            nn.Linear(512, 1024), nn.ReLU(),
            nn.Linear(1024, 512), nn.ReLU(),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
        )
        self.action_layer = nn.Linear(256, action_dim)
        self.value_layer = nn.Linear(256, 1)

        # Per-episode buffers, filled by forward() and by the training loop.
        self.logprobs, self.state_values, self.rewards = [], [], []

        # Kaiming-uniform weights and zero biases for every linear layer.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, a=0.0, nonlinearity='relu')
                nn.init.zeros_(m.bias)

    def _param_device(self):
        """Return the device the module's parameters currently live on."""
        return next(self.parameters()).device

    @torch.no_grad()
    def _sanitize(self, x):
        """Replace NaN/inf entries of ``x`` and clamp the result to [-1e6, 1e6].

        Not called by the other methods of this class.
        """
        x = torch.nan_to_num(x, nan=0.0, posinf=1e6, neginf=-1e6)
        return x.clamp_(-1e6, 1e6)

    def forward(self, state):
        """Sample one action for ``state`` and record what the loss needs.

        Args:
            state: Observation as a ``numpy.ndarray`` or ``torch.Tensor``.

        Returns:
            The sampled action index as a Python ``int``.

        Side effects:
            Appends the action's log-probability to ``self.logprobs`` and the
            value estimate to ``self.state_values``.
        """
        # accept numpy or tensor
        if isinstance(state, np.ndarray):
            state = torch.from_numpy(state)
        # Follow the module's own device so inputs and weights always match,
        # independent of any notebook-level `device` variable.
        state = state.to(device=self._param_device(), dtype=torch.float32)

        h = self.affine(state)                       # already includes ReLU + LayerNorm
        value = self.value_layer(h).squeeze(-1)      # shape: [] or [batch]

        logits = self.action_layer(h)
        # Clip extreme / non-finite logits for numeric stability.
        logits = torch.nan_to_num(logits, nan=0.0, posinf=50.0, neginf=-50.0).clamp(-50.0, 50.0)

        # Build the distribution from logits directly (no explicit softmax).
        dist = Categorical(logits=logits)
        action = dist.sample()

        self.logprobs.append(dist.log_prob(action))
        self.state_values.append(value)              # kept with grad for the value loss

        return action.item()

    def calculateLoss(self, gamma=0.99):
        """Compute the summed actor + critic loss for the buffered episode.

        Discounted returns are computed from ``self.rewards`` and normalised.
        The policy term is ``-log_prob * (return - value)`` with the value
        detached; the critic term is the smooth-L1 loss between value and
        return. Both are summed over the steps.

        Args:
            gamma: Discount factor.

        Returns:
            A scalar tensor suitable for ``backward()``. For an empty buffer a
            zero tensor that requires grad is returned.
        """
        dev = self._param_device()
        # Same pairing as zip(): only steps present in all three buffers count.
        n = min(len(self.logprobs), len(self.state_values), len(self.rewards))
        if n == 0:
            return torch.zeros((), device=dev, requires_grad=True)

        # Discounted returns, accumulated back-to-front in O(n).
        returns = []
        running = 0.0
        for reward in reversed(self.rewards):
            running = float(reward) + gamma * running
            returns.append(running)
        returns.reverse()

        # Explicit float32 keeps the dtype aligned with the value estimates.
        returns = torch.tensor(returns, dtype=torch.float32, device=dev)
        if returns.numel() > 1:
            # eps avoids 0/0 when every return is identical.
            eps = torch.finfo(torch.float32).eps
            returns = (returns - returns.mean()) / (returns.std() + eps)
        else:
            # The sample std of a single element is NaN; only centre it.
            returns = returns - returns.mean()
        returns = returns[:n]

        logprobs = torch.cat([lp.reshape(1) for lp in self.logprobs[:n]])
        values = torch.cat([v.reshape(1) for v in self.state_values[:n]])

        # Detach so the policy term does not push gradients into the critic.
        advantages = returns - values.detach()
        action_loss = -(logprobs * advantages).sum()
        value_loss = F.smooth_l1_loss(values, returns, reduction='sum')
        return action_loss + value_loss

    def clearMemory(self):
        """Empty the three per-episode buffers in place."""
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]

    def save_checkpoint(self, optimizer: optim.Optimizer = None, filename="actor_critic_checkpoint.pth"):
        """Save the model, and the optimizer if given, under ``models/``.

        Args:
            optimizer: Optimizer whose state is stored next to the model
                weights. When ``None`` only the model weights are saved.
            filename: File name inside the ``models`` directory.
        """
        os.makedirs("models", exist_ok=True)
        checkpoint = {'model_state_dict': self.state_dict()}
        if optimizer is not None:
            checkpoint['optimizer_state_dict'] = optimizer.state_dict()
        save_path = os.path.join("models", filename)
        torch.save(checkpoint, save_path)
        print(f"[SAVE] Checkpoint saved to {save_path}")

    def load_checkpoint(self, folder_name=None, filename="actor_critic_checkpoint.pth", optimizer: optim.Optimizer = None, load_optimizer=True):
        """Load model weights, and optimizer state when possible.

        Args:
            folder_name: Directory holding the checkpoint; defaults to
                ``models`` when ``None``.
            filename: Checkpoint file name.
            optimizer: Optimizer to restore. Skipped when ``None``.
            load_optimizer: Set to ``False`` to restore only the model.

        Returns:
            ``True`` if the checkpoint was loaded, ``False`` if the file does
            not exist.
        """
        if folder_name is not None:
            file_path = os.path.join(folder_name, filename)
        else:
            file_path = os.path.join("models", filename)
        if not os.path.exists(file_path):
            print(f"[LOAD] No checkpoint found at {file_path}")
            return False

        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        checkpoint = torch.load(file_path, map_location=self._param_device())
        self.load_state_dict(checkpoint['model_state_dict'])
        if load_optimizer and optimizer is not None and 'optimizer_state_dict' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print(f"[LOAD] Checkpoint loaded from {file_path}")
        return True

In [5]:
import grid2op
from lightsim2grid import LightSimBackend
from grid2op.Reward import L2RPNSandBoxScore
# Create the "l2rpn_case14_sandbox" environment on the LightSim backend, with
# L2RPNSandBoxScore as the reward class.
env = grid2op.make(
    "l2rpn_case14_sandbox",
    backend=LightSimBackend(),
    reward_class=L2RPNSandBoxScore,
    # other_rewards={"loss": LossReward, "margin": MarginReward}
)

# The training loop calls `converter.act(index)` to turn the policy's integer
# output into an action object that `env.step` accepts.
converter = ActionConverter(env)

In [None]:
import torch.optim as optim
from pathlib import Path
import csv

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
def train(num_episodes=10000, max_steps=10000, gamma=0.99, lr=0.02,
          betas=(0.9, 0.999), random_seed=543,
          log_interval=10, checkpoint_interval=100):
    """Train an ``ActorCritic`` policy on the notebook-level ``env``.

    One gradient update is performed per episode from the rewards collected
    during that episode. Uses the notebook globals ``env``, ``converter`` and
    ``device``.

    Logging:
        * ``training_logs/episode_rewards.npy`` and ``episode_steps.npy`` hold
          the per-episode history. Existing files are loaded and extended.
        * ``training_logs/episode_metrics.csv`` gets one row per episode; the
          ``episode`` column restarts from 0 on every call.
        * Checkpoints are written through ``policy.save_checkpoint``.

    Args:
        num_episodes: Number of episodes to run.
        max_steps: Maximum number of environment steps per episode.
        gamma: Discount factor passed to ``calculateLoss``.
        lr: Adam learning rate.
        betas: Adam beta coefficients.
        random_seed: Seed given to ``torch.manual_seed``.
        log_interval: Print the average episode reward every this many
            episodes (must be positive).
        checkpoint_interval: Save a checkpoint every this many episodes
            (must be positive).
    """
    logdir = Path("training_logs")
    logdir.mkdir(parents=True, exist_ok=True)
    rewards_npy = logdir / "episode_rewards.npy"
    steps_npy   = logdir / "episode_steps.npy"
    csv_path    = logdir / "episode_metrics.csv"

    # Continue earlier histories when they exist.
    episode_rewards = list(np.load(rewards_npy).astype(float)) if rewards_npy.exists() else []
    episode_steps   = list(np.load(steps_npy).astype(int))     if steps_npy.exists()   else []

    if not csv_path.exists():
        with csv_path.open("w", newline="") as f:
            csv.writer(f).writerow(["episode", "length_steps", "episode_reward"])

    torch.manual_seed(random_seed)

    policy = ActorCritic().to(device=device)
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    print(lr,betas)

    running_reward = 0.0      # sum of episode rewards since the last log line
    episodes_since_log = 0    # how many episodes that sum covers
    for i_episode in range(num_episodes):
        ep_reward = 0.0
        steps = 0
        state = env.reset()

        for _ in range(max_steps):
            action = policy(state.to_vect())
            env_action = converter.act(action)
            state, reward, done, _info = env.step(env_action)

            ep_reward += reward
            steps += 1
            policy.rewards.append(reward)
            if done:
                break

        running_reward += ep_reward
        episodes_since_log += 1

        # Update the policy from this episode's buffers.
        optimizer.zero_grad()
        loss = policy.calculateLoss(gamma)
        loss.backward()
        optimizer.step()
        policy.clearMemory()

        episode_rewards.append(ep_reward)
        episode_steps.append(steps)
        np.save(rewards_npy, np.asarray(episode_rewards, dtype=np.float32))
        np.save(steps_npy,   np.asarray(episode_steps,   dtype=np.int32))
        with csv_path.open("a", newline="") as f:
            csv.writer(f).writerow([i_episode, steps, f"{ep_reward:.6f}"])

        # Periodic checkpoint (model + optimizer).
        if i_episode % checkpoint_interval == 0:
            policy.save_checkpoint(optimizer, filename=f"actor_critic_episode_{i_episode}.pth")

        if i_episode % log_interval == 0:
            # Average over the episodes actually accumulated since the last
            # log line, and report the real step count of this episode.
            avg_reward = running_reward / episodes_since_log
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, steps, avg_reward))
            running_reward = 0.0
            episodes_since_log = 0
    
            
# Start training when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    train()