In [1]:
%load_ext autoreload
%autoreload 2

import torch
from rl4co.envs import CVRPTWEnv, DARPEnv, PDPTWEnv
from rl4co.envs.routing import CVRPTWGenerator, DARPGenerator, PDPTWGenerator
from rl4co.models import REINFORCE
from rl4co.utils.trainer import RL4COTrainer
from lightning.pytorch.callbacks import ModelCheckpoint, RichModelSummary
from rl4co.utils.decoding import rollout, random_policy
from rl4co.envs.common import RL4COEnvBase, Generator, get_sampler
from rl4co.models.zoo import AttentionModel, AttentionModelPolicy
from rl4co.utils.ops import gather_by_index, get_tour_length

from ortools_solver import solve_darp_with_ortools


TypeError: unsupported operand type(s) for |: 'type' and 'ABCMeta'

# Generate PDPTW instances via PDPTWGenerator

In [None]:
# Build one random PDPTW instance (num_loc=30) and print the OR-Tools solution for it.
# `device` is assigned in this cell because this is the first place it is needed;
# without it the cell raises NameError on a fresh kernel (Restart & Run All).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
td_init = PDPTWEnv(generator_params={"num_loc": 30}).reset(batch_size=[1]).to(device)
print(solve_darp_with_ortools(td_init, vehicle_speed=5.0, time_limit_seconds=150, max_pdptw_vehicles=6))

In [None]:
# Print whatever instance is currently bound to `td_init`.
print(td_init)

In [None]:
# Draw a batch (batch_size=[4], num_loc=10) directly from the PDPTW generator and
# dump it field by field: names first, then shapes, then the values of batch entry 0.
td_init = PDPTWGenerator(num_loc=10)._generate(batch_size=[4])
field_names = list(td_init.keys())

# Names are printed on one line; no newline is emitted after the last one.
print("FIELDS:", end=' ')
for name in field_names:
    print(name, end=', ')

for name in field_names:
    field_shape = td_init[name].shape
    print(f"{name} size:  {field_shape}" )

first_entry = td_init[0]
for name in field_names:
    print(f"{name} content:  {first_entry[name]}" )

# Vehicle_speed = 5

# NOTE(review): `torch` and `solve_darp_with_ortools` are also imported in the first
# cell, where the solver comes from `ortools_solver` rather than `examples.ortools_solver`.
# Confirm which module path is intended and keep a single import at the top.
import torch
from examples.ortools_solver import solve_darp_with_ortools

# Run the OR-Tools solver on the current `td_init` (vehicle_speed=5.0,
# time_limit_seconds=10) and print whatever it returns.
td = td_init
print(solve_darp_with_ortools(td, vehicle_speed=5.0, time_limit_seconds=10))


In [None]:
# Greedy rollout of the checkpointed policy on one random DARP instance
# (num_loc=30, num_agents=6), followed by a render of the decoded actions.
# NOTE(review): `model_ckpt` is not assigned in any cell visible in this notebook;
# it has to be loaded before this cell can run on a fresh kernel.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
policy_new = model_ckpt.policy.to(device)
#env = new_model_checkpoint.env.to(device)
darp_params = {'num_loc': 30, "num_agents": 6}
env = DARPEnv(generator_params=darp_params)

td_init = env.reset(batch_size=[1]).to(device)
out = policy_new(td_init.clone(), env, phase="test", decode_type="greedy")

# The printed lengths are the negated rewards, formatted to two decimals.
formatted_lengths = [f'{-r.item():.2f}' for r in out['reward']]
print(f"Tour lengths: {formatted_lengths}")

decoded_actions = out['actions'].cpu()
for td, actions in zip(td_init, decoded_actions):
    env.render(td, actions)

print(out)
print(td_init['time_windows'])


In [None]:
# Display the "locs" entry of the current `td_init`.
td_init["locs"]

In [None]:
# Solve one random DARP instance (num_loc=10, num_agents=6) with OR-Tools
# (time_limit_seconds=5), render the returned actions, then print the raw result.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
small_darp_params = {'num_loc': 10, "num_agents": 6}
env = DARPEnv(generator_params=small_darp_params)

td_init = env.reset(batch_size=[1]).to(device)
or_out = solve_darp_with_ortools(td_init, time_limit_seconds=5)

solver_actions = or_out['actions'].cpu()
for td, actions in zip(td_init, solver_actions):
    env.render(td, actions)

print(or_out)
print(td_init['time_windows'])

In [None]:
# Greedy-decode the policy on the current `td_init` / `env`, render the decoded
# actions, then print the raw policy output and the instance's time windows.
out = policy_new(td_init.clone(), env, phase="test", decode_type="greedy")

policy_actions = out['actions'].cpu()
for td, actions in zip(td_init, policy_actions):
    env.render(td, actions)

print(out)
print(td_init['time_windows'])

# Comparison: learned policy vs OR-Tools

In [None]:
import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# Benchmark the greedy policy against OR-Tools on freshly sampled DARP instances.
# Relies on `DARPEnv`, `solve_darp_with_ortools` and `policy_new` being defined
# by earlier cells.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_loc_list = [10, 14, 18, 22, 26, 30]   # heatmap columns
num_agents_list = [6]                     # heatmap rows
n_eval = 5                                # instances per setting; increase for smoother numbers
or_time_limit_seconds = 10                # OR-Tools budget per instance


def percent_cost_gap(policy_cost, reference_cost, eps=1e-6):
    """Return the policy's cost gap relative to the reference, in percent.

    A positive value means the policy is more expensive than the reference.
    When the reference cost has magnitude below ``eps`` the gap is reported
    as 0.0 instead of dividing by (almost) zero.
    """
    if abs(reference_cost) < eps:
        return 0.0
    return 100.0 * (policy_cost - reference_cost) / abs(reference_cost)


# One row per num_agents value, one entry per num_loc value.
pct_table = []

for na in num_agents_list:
    row_vals = []
    for nl in tqdm(num_loc_list):
        # Fresh environment for this (num_loc, num_agents) setting.
        env = DARPEnv(generator_params={
            "num_loc": nl,
            "num_agents": na,
        })

        or_costs = []
        pol_costs = []

        for _ in range(n_eval):
            # Both solvers are evaluated on the same initial state.
            td_init = env.reset(batch_size=[1]).to(device)

            or_out = solve_darp_with_ortools(
                td_init, time_limit_seconds=or_time_limit_seconds
            )

            # Evaluation only: skip autograd bookkeeping for the policy forward pass.
            with torch.no_grad():
                pol_out = policy_new(
                    td_init.clone(), env, phase="test", decode_type="greedy"
                )

            # Rewards are negated costs (e.g. tensor([-452.5])), so flip the sign
            # to compare costs, which is what the plot title reports.
            or_costs.append(-or_out["reward"].detach().cpu().item())
            pol_costs.append(-pol_out["reward"].detach().cpu().item())

        avg_or_cost = sum(or_costs) / len(or_costs)
        avg_pol_cost = sum(pol_costs) / len(pol_costs)

        row_vals.append(percent_cost_gap(avg_pol_cost, avg_or_cost))
    pct_table.append(row_vals)

# DataFrame: rows = num_agents, cols = num_loc
df = pd.DataFrame(
    pct_table,
    index=[f"agents={na}" for na in num_agents_list],
    columns=[f"loc={nl}" for nl in num_loc_list],
)

print(df)

# Heatmap with one cell per (num_agents, num_loc) setting, centred on a zero gap.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    df,
    annot=True,
    fmt=".1f",
    cmap="coolwarm",
    center=0,
    cbar_kws={"label": "% cost diff (policy vs OR-Tools, > 0 = policy worse)"},
    ax=ax,
)
ax.set_title("Policy vs OR-Tools: % Cost Difference\n")
fig.tight_layout()
plt.show()

In [None]:
# Display the most recent `solve_darp_with_ortools` result left in `or_out`.
or_out