In [10]:
"""
Simple GFlowNet Training with Variance Targeting
"""
import torch
import torch.optim as optim
import mlflow
import numpy as np
from pathlib import Path

from gfn_environments.single_color_ramp import BlenderTerrainAPI
from gfn_environments.single_color_ramp import (
    ColorRampGFlowNet,
    State,
    ActionRegistry,
    ReplayBuffer,
    sample_trajectory_with_heightmaps
)


def _variance_reward(variance, variance_min, variance_max):
    """Return the shaped reward for a terrain with the given height variance.

    The reward is 1.0 inside ``[variance_min, variance_max]``. Outside the
    range it decays as ``exp(-3 * distance)`` with the distance to the nearest
    bound and is floored at 0.1.
    """
    if variance_min <= variance <= variance_max:
        return 1.0
    distance = min(abs(variance - variance_min), abs(variance - variance_max))
    # float() keeps the stored reward a plain Python float, not np.float64.
    return max(0.1, float(np.exp(-distance * 3)))


def _fraction_in_range(values, low, high):
    """Return the fraction of ``values`` inside ``[low, high]`` (0.0 if empty)."""
    if not values:
        return 0.0
    return sum(1 for v in values if low <= v <= high) / len(values)


def _trajectory_loss(gfn, trajectory, reward):
    """Return the reward-weighted negative log-likelihood of a trajectory.

    The recorded steps are replayed from a fresh ``State()``. At every step
    the policy logits are masked with ``state.to_action_mask()`` (entries where
    the mask is False are replaced by -1e9) before the log-softmax.

    Returns:
        The summed loss tensor, or ``None`` when the trajectory has no steps
        (there is nothing to backpropagate in that case).
    """
    state = State()
    loss = None
    for step in trajectory:
        state_tensor = state.to_state_tensor().unsqueeze(0)
        logits = gfn.policy(state_tensor).squeeze(0)

        mask = state.to_action_mask()
        masked_logits = torch.where(mask, logits, torch.tensor(-1e9))
        log_probs = torch.nn.functional.log_softmax(masked_logits, dim=0)

        # Negative log-likelihood of the action actually taken, scaled by reward.
        step_loss = -log_probs[step['action_idx']] * reward
        loss = step_loss if loss is None else loss + step_loss

        state = state.apply_action(step['action_name'], step['value_idx'])
    return loss


def _sample_into_buffer(gfn, blender_api, buffer, variance_min, variance_max,
                        max_steps=20):
    """Sample one trajectory, store it with its variance and reward.

    Returns:
        The variance of the last heightmap returned by the sampler.
    """
    trajectory, final_state, heightmaps = sample_trajectory_with_heightmaps(
        gfn, blender_api, max_steps=max_steps
    )
    traj_id = buffer.add_trajectory(trajectory, final_state, heightmaps)
    variance = heightmaps[-1].var().item()
    buffer.add_reward(traj_id, 'variance', variance)
    buffer.add_reward(
        traj_id, 'reward', _variance_reward(variance, variance_min, variance_max)
    )
    return variance


def train_variance_targeting_gfn(
    target_variance_min: float = 0.3,
    target_variance_max: float = 0.7,
    num_epochs: int = 1000,
    batch_size: int = 32,
    buffer_capacity: int = 10000,
    learning_rate: float = 1e-3,
    initial_samples: int = 10000,
    eval_interval: int = 100,
    eval_samples: int = 100,
    samples_per_epoch: int = 5,
    max_steps: int = 20,
):
    """Train GFlowNet to generate terrains with variance in target range.

    Phase 1 fills a replay buffer with trajectories sampled from the untrained
    network. Phase 2 alternates between adding fresh samples, drawing a random
    batch from the buffer and taking one optimizer step per trajectory on a
    reward-weighted negative log-likelihood loss. Rewards are computed once,
    when a trajectory is stored, because they depend only on its variance.

    Must be called inside an active MLflow run; parameters and metrics are
    logged through the module-level ``mlflow`` API.

    Args:
        target_variance_min: Lower bound of the target variance range.
        target_variance_max: Upper bound of the target variance range.
        num_epochs: Number of training epochs.
        batch_size: Maximum number of buffered trajectories per epoch.
        buffer_capacity: Capacity passed to ``ReplayBuffer``.
        learning_rate: Adam learning rate.
        initial_samples: Trajectories sampled in Phase 1.
        eval_interval: Evaluate every this many epochs (and at epoch 0);
            values <= 0 disable the periodic evaluation.
        eval_samples: Trajectories sampled per evaluation; <= 0 skips it.
        samples_per_epoch: New trajectories added to the buffer each epoch.
        max_steps: Step limit passed to the trajectory sampler.

    Returns:
        Tuple ``(gfn, buffer, best_success_rate)``.
    """
    # Local import: hoisted out of the epoch loop, where it was re-executed
    # every iteration.
    import random

    # Initialize
    buffer = ReplayBuffer(capacity=buffer_capacity)
    gfn = ColorRampGFlowNet(hidden_dim=128)
    optimizer = optim.Adam(gfn.parameters(), lr=learning_rate)
    blender_api = BlenderTerrainAPI()

    # Log config
    mlflow.log_params({
        'target_variance_min': target_variance_min,
        'target_variance_max': target_variance_max,
        'num_epochs': num_epochs,
        'batch_size': batch_size,
        'buffer_capacity': buffer_capacity,
        'learning_rate': learning_rate,
        'initial_samples': initial_samples,
    })

    print("="*80)
    print(f"Training for variance range: [{target_variance_min:.3f}, {target_variance_max:.3f}]")
    print("="*80 + "\n")

    # ========================================================================
    # PHASE 1: Fill Buffer
    # ========================================================================

    print(f"Phase 1: Filling buffer with {initial_samples} samples...")

    for i in range(initial_samples):
        _sample_into_buffer(
            gfn, blender_api, buffer,
            target_variance_min, target_variance_max, max_steps,
        )

        if (i + 1) % 20 == 0:
            print(f"  Progress: {i + 1}/{initial_samples}")

    # Compute baseline (0.0 when the buffer is empty instead of dividing by zero)
    baseline_variances = [buffer.rewards[r.id]['variance'] for r in buffer.records]
    baseline_success_rate = _fraction_in_range(
        baseline_variances, target_variance_min, target_variance_max
    )

    baseline_metrics = {
        'buffer/size': len(buffer),
        'buffer/baseline_success_rate': baseline_success_rate,
    }
    if baseline_variances:
        baseline_metrics['buffer/avg_variance'] = float(np.mean(baseline_variances))
    mlflow.log_metrics(baseline_metrics, step=0)

    print(f"✓ Buffer filled. Baseline success rate: {baseline_success_rate:.2%}\n")

    # ========================================================================
    # PHASE 2: Training Loop
    # ========================================================================

    print(f"Phase 2: Training for {num_epochs} epochs...\n")

    best_success_rate = baseline_success_rate

    for epoch in range(num_epochs):
        # Sample new trajectories and add them (with rewards) to the buffer
        for _ in range(samples_per_epoch):
            _sample_into_buffer(
                gfn, blender_api, buffer,
                target_variance_min, target_variance_max, max_steps,
            )

        # Sample batch from buffer
        all_ids = [record.id for record in buffer.records]
        batch_ids = random.sample(all_ids, min(batch_size, len(all_ids)))

        # Train on batch: one optimizer step per trajectory
        total_loss = 0.0
        trained = 0

        for traj_id in batch_ids:
            trajectory = buffer.get_trajectory(traj_id)
            if trajectory is None:
                continue

            # Falls back to 0.5 when no reward is stored for this trajectory.
            reward = buffer.rewards[traj_id].get('reward', 0.5)

            optimizer.zero_grad()
            traj_loss = _trajectory_loss(gfn, trajectory, reward)
            if traj_loss is None:
                # Empty trajectory: nothing to backpropagate.
                continue

            traj_loss.backward()
            optimizer.step()

            total_loss += traj_loss.item()
            trained += 1

        # Average over the trajectories actually trained on; NaN marks an
        # epoch in which none were available.
        avg_loss = total_loss / trained if trained else float('nan')

        # Log training metrics
        train_metrics = {'train/epoch': epoch}
        if trained:
            train_metrics['train/loss'] = avg_loss
        mlflow.log_metrics(train_metrics, step=epoch)

        # Evaluation
        is_eval_epoch = epoch == 0 or (
            eval_interval > 0 and (epoch + 1) % eval_interval == 0
        )
        if is_eval_epoch and eval_samples > 0:
            print(f"\nEvaluating at epoch {epoch + 1}...")

            eval_variances = []
            for _ in range(eval_samples):
                _, _, heightmaps = sample_trajectory_with_heightmaps(
                    gfn, blender_api, max_steps=max_steps
                )
                eval_variances.append(heightmaps[-1].var().item())

            success_rate = _fraction_in_range(
                eval_variances, target_variance_min, target_variance_max
            )
            best_success_rate = max(best_success_rate, success_rate)

            improvement = success_rate - baseline_success_rate

            mlflow.log_metrics({
                'eval/success_rate': success_rate,
                'eval/best_success_rate': best_success_rate,
                'eval/improvement': improvement,
                'eval/avg_variance': np.mean(eval_variances),
                'eval/std_variance': np.std(eval_variances),
            }, step=epoch)

            print(f"Epoch {epoch+1:4d}: Loss={avg_loss:.4f}, "
                  f"Success={success_rate:.2%}, Best={best_success_rate:.2%}, "
                  f"Improve={improvement:+.2%}")

    print("\n" + "="*80)
    print("Training Complete!")
    print(f"  Best success rate: {best_success_rate:.2%}")
    print(f"  Baseline: {baseline_success_rate:.2%}")
    print(f"  Improvement: {best_success_rate - baseline_success_rate:+.2%}")
    print("="*80)

    return gfn, buffer, best_success_rate


if __name__ == "__main__":
    # Hyperparameters for this run, collected in one place.
    run_config = dict(
        target_variance_min=0.3,
        target_variance_max=0.7,
        num_epochs=500,
        batch_size=32,
        learning_rate=1e-3,
        initial_samples=10000,
        eval_interval=100,
        eval_samples=100,
    )

    mlflow.set_experiment("gflownet_variance_targeting")

    # Everything logged by the training function lands in this MLflow run.
    with mlflow.start_run(run_name="simple_training"):
        trained_gfn, replay_buffer, best_rate = train_variance_targeting_gfn(
            **run_config
        )

        print(f"\n✓ Training complete. Best success rate: {best_rate:.2%}")

Initialized GFlowNet:
  Input dimension:  49
  Output dimension: 46
  Hidden dimension: 128
Read blend: "/home/jpleona/jpleona_c/bpygfn/gfn_environments/single_color_ramp.blend"
✓ Loaded template: /home/jpleona/jpleona_c/bpygfn/gfn_environments/single_color_ramp.blend
Training for variance range: [0.300, 0.700]

Phase 1: Filling buffer with 10000 samples...
  Progress: 20/10000
  Progress: 40/10000
  Progress: 60/10000
  Progress: 80/10000
  Progress: 100/10000
  Progress: 120/10000
  Progress: 140/10000
  Progress: 160/10000
  Progress: 180/10000
  Progress: 200/10000
  Progress: 220/10000
  Progress: 240/10000
  Progress: 260/10000
  Progress: 280/10000
  Progress: 300/10000
  Progress: 320/10000
  Progress: 340/10000
  Progress: 360/10000
  Progress: 380/10000
  Progress: 400/10000
  Progress: 420/10000
  Progress: 440/10000
  Progress: 460/10000
  Progress: 480/10000
  Progress: 500/10000
  Progress: 520/10000
  Progress: 540/10000
  Progress: 560/10000
  Progress: 580/10000
  Prog

In [15]:
"""
Simple GFlowNet Training with Variance Targeting
"""
import torch
import torch.optim as optim
import mlflow
import numpy as np

from gfn_environments.single_color_ramp import BlenderTerrainAPI
from gfn_environments.single_color_ramp import (
    ColorRampGFlowNet,
    State,
    ActionRegistry,
    ReplayBuffer,
    sample_trajectory_with_heightmaps,
    load_blend_single_color_ramp
)


def _variance_reward(variance, variance_min, variance_max):
    """Return the shaped reward for a terrain with the given height variance.

    The reward is 1.0 inside ``[variance_min, variance_max]``. Outside the
    range it decays as ``exp(-3 * distance)`` with the distance to the nearest
    bound and is floored at 0.1.
    """
    if variance_min <= variance <= variance_max:
        return 1.0
    distance = min(abs(variance - variance_min), abs(variance - variance_max))
    # float() keeps the stored reward a plain Python float, not np.float64.
    return max(0.1, float(np.exp(-distance * 3)))


def _fraction_in_range(values, low, high):
    """Return the fraction of ``values`` inside ``[low, high]`` (0.0 if empty)."""
    if not values:
        return 0.0
    return sum(1 for v in values if low <= v <= high) / len(values)


def _trajectory_loss(gfn, trajectory, reward):
    """Return the reward-weighted negative log-likelihood of a trajectory.

    The recorded steps are replayed from a fresh ``State()``. At every step
    the policy logits are masked with ``state.to_action_mask()`` (entries where
    the mask is False are replaced by -1e9) before the log-softmax.

    Returns:
        The summed loss tensor, or ``None`` when the trajectory has no steps
        (there is nothing to backpropagate in that case).
    """
    state = State()
    loss = None
    for step in trajectory:
        state_tensor = state.to_state_tensor().unsqueeze(0)
        logits = gfn.policy(state_tensor).squeeze(0)

        mask = state.to_action_mask()
        masked_logits = torch.where(mask, logits, torch.tensor(-1e9))
        log_probs = torch.nn.functional.log_softmax(masked_logits, dim=0)

        # Negative log-likelihood of the action actually taken, scaled by reward.
        step_loss = -log_probs[step['action_idx']] * reward
        loss = step_loss if loss is None else loss + step_loss

        state = state.apply_action(step['action_name'], step['value_idx'])
    return loss


def _sample_into_buffer(gfn, blender_api, buffer, variance_min, variance_max,
                        max_steps=20):
    """Sample one trajectory, store it with its variance and reward.

    Returns:
        The variance of the last heightmap returned by the sampler.
    """
    trajectory, final_state, heightmaps = sample_trajectory_with_heightmaps(
        gfn, blender_api, max_steps=max_steps
    )
    traj_id = buffer.add_trajectory(trajectory, final_state, heightmaps)
    variance = heightmaps[-1].var().item()
    buffer.add_reward(traj_id, 'variance', variance)
    buffer.add_reward(
        traj_id, 'reward', _variance_reward(variance, variance_min, variance_max)
    )
    return variance


def train_variance_targeting_gfn(
    target_variance_min: float = 0.3,
    target_variance_max: float = 0.7,
    num_epochs: int = 1000,
    batch_size: int = 32,
    buffer_capacity: int = 1000,
    learning_rate: float = 1e-3,
    initial_samples: int = 100,
    eval_interval: int = 50,
    eval_samples: int = 20,
    samples_per_epoch: int = 5,
    max_steps: int = 20,
):
    """Train GFlowNet to generate terrains with variance in target range.

    Phase 1 fills a replay buffer with trajectories sampled from the untrained
    network. Phase 2 alternates between adding fresh samples, drawing a random
    batch from the buffer and taking a single optimizer step on the summed
    reward-weighted negative log-likelihood of that batch.

    Every stored trajectory gets the same shaped reward (1.0 in range,
    exponential decay floored at 0.1 outside), so a given variance carries the
    same training weight whether it was sampled in Phase 1 or Phase 2.

    Must be called inside an active MLflow run; parameters and metrics are
    logged through the module-level ``mlflow`` API.

    Args:
        target_variance_min: Lower bound of the target variance range.
        target_variance_max: Upper bound of the target variance range.
        num_epochs: Number of training epochs.
        batch_size: Maximum number of buffered trajectories per epoch.
        buffer_capacity: Capacity passed to ``ReplayBuffer``.
        learning_rate: Adam learning rate.
        initial_samples: Trajectories sampled in Phase 1.
        eval_interval: Evaluate every this many epochs (and at epoch 0);
            values <= 0 disable the periodic evaluation.
        eval_samples: Trajectories sampled per evaluation; <= 0 skips it.
        samples_per_epoch: New trajectories added to the buffer each epoch.
        max_steps: Step limit passed to the trajectory sampler.

    Returns:
        Tuple ``(gfn, buffer, best_success_rate)``.
    """
    # Local import: hoisted out of the epoch loop, where it was re-executed
    # every iteration.
    import random

    # Initialize
    load_blend_single_color_ramp()
    buffer = ReplayBuffer(capacity=buffer_capacity)
    gfn = ColorRampGFlowNet(hidden_dim=128)
    optimizer = optim.Adam(gfn.parameters(), lr=learning_rate)
    blender_api = BlenderTerrainAPI()

    # Log config
    mlflow.log_params({
        'target_variance_min': target_variance_min,
        'target_variance_max': target_variance_max,
        'num_epochs': num_epochs,
        'batch_size': batch_size,
        'buffer_capacity': buffer_capacity,
        'learning_rate': learning_rate,
        'initial_samples': initial_samples,
    })

    print("="*80)
    print(f"Training for variance range: [{target_variance_min:.3f}, {target_variance_max:.3f}]")
    print("="*80 + "\n")

    # ========================================================================
    # PHASE 1: Fill Buffer
    # ========================================================================

    print(f"Phase 1: Filling buffer with {initial_samples} samples...")

    for i in range(initial_samples):
        _sample_into_buffer(
            gfn, blender_api, buffer,
            target_variance_min, target_variance_max, max_steps,
        )

        if (i + 1) % 20 == 0:
            print(f"  Progress: {i + 1}/{initial_samples}")

    # Compute baseline (0.0 when the buffer is empty instead of dividing by zero)
    baseline_variances = [buffer.rewards[r.id]['variance'] for r in buffer.records]
    baseline_success_rate = _fraction_in_range(
        baseline_variances, target_variance_min, target_variance_max
    )

    baseline_metrics = {
        'buffer/size': len(buffer),
        'buffer/baseline_success_rate': baseline_success_rate,
    }
    if baseline_variances:
        baseline_metrics['buffer/avg_variance'] = float(np.mean(baseline_variances))
    mlflow.log_metrics(baseline_metrics, step=0)

    print(f"✓ Buffer filled. Baseline success rate: {baseline_success_rate:.2%}\n")

    # ========================================================================
    # PHASE 2: Training Loop
    # ========================================================================

    print(f"Phase 2: Training for {num_epochs} epochs...\n")

    best_success_rate = baseline_success_rate

    for epoch in range(num_epochs):
        # Sample new trajectories and add to buffer (ON-POLICY)
        new_variances = [
            _sample_into_buffer(
                gfn, blender_api, buffer,
                target_variance_min, target_variance_max, max_steps,
            )
            for _ in range(samples_per_epoch)
        ]

        # Sample batch from buffer
        all_ids = [record.id for record in buffer.records]
        batch_ids = random.sample(all_ids, min(batch_size, len(all_ids)))

        # SINGLE OPTIMIZER STEP FOR ENTIRE BATCH
        optimizer.zero_grad()
        batch_loss = None
        trained = 0

        for traj_id in batch_ids:
            trajectory = buffer.get_trajectory(traj_id)
            if trajectory is None:
                continue

            # Falls back to 0.5 when no reward is stored for this trajectory.
            reward = buffer.rewards[traj_id].get('reward', 0.5)

            traj_loss = _trajectory_loss(gfn, trajectory, reward)
            if traj_loss is None:
                # Empty trajectory: contributes nothing to the batch loss.
                continue

            batch_loss = traj_loss if batch_loss is None else batch_loss + traj_loss
            trained += 1

        if batch_loss is not None:
            # Single backprop for entire batch
            batch_loss.backward()
            optimizer.step()
            avg_loss = batch_loss.item() / trained
        else:
            # No usable trajectory this epoch; skip the step rather than
            # calling .backward() on a non-tensor.
            avg_loss = float('nan')

        # Log training metrics
        new_success_rate = _fraction_in_range(
            new_variances, target_variance_min, target_variance_max
        )

        train_metrics = {
            'train/new_success_rate': new_success_rate,
            'train/avg_new_variance': np.mean(new_variances) if new_variances else 0,
        }
        if trained:
            train_metrics['train/loss'] = avg_loss
        mlflow.log_metrics(train_metrics, step=epoch)

        # Evaluation
        is_eval_epoch = epoch == 0 or (
            eval_interval > 0 and (epoch + 1) % eval_interval == 0
        )
        if is_eval_epoch and eval_samples > 0:
            print(f"\nEvaluating at epoch {epoch + 1}...")

            eval_variances = []
            for _ in range(eval_samples):
                _, _, heightmaps = sample_trajectory_with_heightmaps(
                    gfn, blender_api, max_steps=max_steps
                )
                eval_variances.append(heightmaps[-1].var().item())

            success_rate = _fraction_in_range(
                eval_variances, target_variance_min, target_variance_max
            )
            best_success_rate = max(best_success_rate, success_rate)

            improvement = success_rate - baseline_success_rate

            mlflow.log_metrics({
                'eval/success_rate': success_rate,
                'eval/best_success_rate': best_success_rate,
                'eval/improvement': improvement,
                'eval/avg_variance': np.mean(eval_variances),
                'eval/std_variance': np.std(eval_variances),
            }, step=epoch)

            print(f"Epoch {epoch+1:4d}: Loss={avg_loss:.4f}, "
                  f"NewSuccess={new_success_rate:.2%}, "
                  f"EvalSuccess={success_rate:.2%}, Best={best_success_rate:.2%}")

    print("\n" + "="*80)
    print("Training Complete!")
    print(f"  Best success rate: {best_success_rate:.2%}")
    print(f"  Baseline: {baseline_success_rate:.2%}")
    print(f"  Improvement: {best_success_rate - baseline_success_rate:+.2%}")
    print("="*80)

    return gfn, buffer, best_success_rate


if __name__ == "__main__":
    mlflow.set_experiment("gflownet_variance_targeting")

    # Everything logged by the training function lands in this MLflow run.
    with mlflow.start_run(run_name="simple_training"):
        trained_gfn, replay_buffer, best_rate = train_variance_targeting_gfn(
            target_variance_min=0.3,
            target_variance_max=0.7,
            num_epochs=1000,
            batch_size=32,
            # The function's default capacity (1000) is smaller than the
            # 10000 initial samples requested below, so size the buffer to
            # hold all of them.
            # NOTE(review): ReplayBuffer's overflow behavior is not visible
            # here; confirm whether it evicts or rejects beyond capacity.
            buffer_capacity=10000,
            learning_rate=1e-3,
            initial_samples=10000,
            eval_interval=200,
            eval_samples=50
        )

        print(f"\n✓ Training complete. Best success rate: {best_rate:.2%}")

Read blend: "/home/jpleona/jpleona_c/bpygfn/gfn_environments/single_color_ramp.blend"
✓ Loaded template: /home/jpleona/jpleona_c/bpygfn/gfn_environments/single_color_ramp.blend
Initialized GFlowNet:
  Input dimension:  49
  Output dimension: 46
  Hidden dimension: 128
Read blend: "/home/jpleona/jpleona_c/bpygfn/gfn_environments/single_color_ramp.blend"
✓ Loaded template: /home/jpleona/jpleona_c/bpygfn/gfn_environments/single_color_ramp.blend
Training for variance range: [0.300, 0.700]

Phase 1: Filling buffer with 10000 samples...
  Progress: 20/10000
  Progress: 40/10000
  Progress: 60/10000
  Progress: 80/10000
  Progress: 100/10000
  Progress: 120/10000
  Progress: 140/10000
  Progress: 160/10000
  Progress: 180/10000
  Progress: 200/10000
  Progress: 220/10000
  Progress: 240/10000
  Progress: 260/10000
  Progress: 280/10000
  Progress: 300/10000
  Progress: 320/10000
  Progress: 340/10000
  Progress: 360/10000
  Progress: 380/10000
  Progress: 400/10000
  Progress: 420/10000
  Pro