In [1]:
import numpy as np
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy

# from imitation.algorithms.adversarial.airl import AIRL
from IRL_lib_mod.airl import AIRL
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.policies.serialize import load_policy
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.util.networks import RunningNorm
from utils.irl_utils import make_vec_env_robosuite
from utils.demostration_utils import load_dataset_to_trajectories
import os
import h5py
import json
from robosuite.controllers import load_controller_config
from utils.demostration_utils import load_dataset_and_annotations_simutanously
from utils.annotation_utils import read_all_json
from imitation.util import logger as imit_logger
import imitation.scripts.train_adversarial as train_adversarial
import argparse
import robosuite as suite
import torch
from utils.demostration_utils import load_data_to_h5py
from utils.annotation_utils import write_to_json
import time
import matplotlib.pyplot as plt
import numpy as np
import scipy
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

2025-01-29 16:44:25.428741: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-29 16:44:25.439199: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-29 16:44:25.442644: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-29 16:44:25.451421: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# --- Experiment configuration -------------------------------------------------
env_name = "Lift"
horizon = 300  # maximum number of control steps per episode

# Resolve the project root from the notebook's working directory: everything
# before the first occurrence of "data analysis" in the absolute path is kept.
notebook_path = os.path.abspath(os.getcwd())
project_path = notebook_path.split("data analysis")[0]
print(project_path)
dataset_path = os.path.join(project_path, "human-demo/lift/low_dim_v141_lift_ph.hdf5")

# Only the environment metadata stored on the "data" group is needed, so the
# demonstration file is opened read-only and closed again as soon as the
# attribute has been parsed. `f` stays bound afterwards but refers to a closed
# file; `env_meta` is a plain dict and does not depend on the file being open.
with h5py.File(dataset_path, 'r') as f:
    env_meta = json.loads(f["data"].attrs["env_args"])

make_env_kwargs = dict(
    robots="Panda",                         # a single Panda arm
    gripper_types="default",                # default gripper for the arm
    controller_configs=env_meta["env_kwargs"]["controller_configs"],   # controller settings stored with the dataset
    has_renderer=True,                      # on-screen rendering enabled
    render_camera="frontview",              # visualize the "frontview" camera
    has_offscreen_renderer=True,            # off-screen rendering enabled as well
    control_freq=20,                        # 20 Hz control for applied actions
    horizon=horizon,                        # episode length cap (see `horizon` above)
    use_object_obs=True,                    # include object observations
    use_camera_obs=False,                   # no camera observations
    reward_shaping=True,                    # use the environment's shaped reward
)
# NOTE(review): SEED is assigned here but not used by anything in this cell.
SEED = 1

env = suite.make(
    env_name,
    **make_env_kwargs,
)

/home/hang/DHIRL_Progress/learning-with-progress


In [3]:
class CustomLoggingPolicy(MlpPolicy):
    """MlpPolicy variant whose ``forward`` periodically prints action statistics.

    Every ``LOG_EVERY``-th call to ``forward`` prints the last action of the
    batch and the fraction of actions in the batch whose final component is
    positive. The outputs of ``forward`` are returned unchanged.

    The call counter lives on the policy itself, so the class works on a fresh
    kernel without any module-level counter having to be defined first.
    """

    # Print diagnostics once every this many ``forward`` calls.
    LOG_EVERY = 2000
    # Class-level default; the first ``+=`` in ``forward`` creates a
    # per-instance counter, so separate policies count independently.
    _forward_call_count = 0

    def forward(self, obs: torch.Tensor, deterministic: bool = False):
        """Run the parent forward pass and occasionally print action diagnostics.

        Args:
            obs: Observation batch, passed through to ``MlpPolicy.forward``.
            deterministic: Passed through to ``MlpPolicy.forward``.

        Returns:
            The ``(actions, values, log_probs)`` tuple produced by the parent
            class, unmodified.
        """
        self._forward_call_count += 1

        # Get the action, value, and log probability from the parent class
        actions, values, log_probs = super().forward(obs, deterministic)

        if self._forward_call_count % self.LOG_EVERY == 0:
            # Convert once to NumPy; used for both printed diagnostics.
            actions_np = actions.detach().cpu().numpy()
            print(f"Actions: {actions_np[-1]}")

            # Share of batch entries whose last action component is > 0.
            positive_last = np.sum(actions_np[:, -1] > 0)
            ratio = positive_last / actions_np.shape[0]
            print(f"Positive ratio: {ratio}")

        # Return the outputs as usual
        return actions, values, log_probs

def _flatten_obs(obs_dict, obs_keys):
    """Concatenate the selected entries of an observation dict into one flat array."""
    return np.concatenate([obs_dict[key] for key in obs_keys])


def evaluate_policy_on_env(env, 
                           exp_name,
                           checkpoint,
                           evaluate_times=10, 
                           render=False):
    """Roll out a saved generator policy and compare env reward with learned reward.

    Loads the PPO policy and the pickled reward network stored under
    ``{project_path}/checkpoints/{exp_name}/{checkpoint}/``, runs
    ``evaluate_times`` episodes with deterministic actions, and for every
    episode records the summed environment reward and the Spearman correlation
    between the per-step environment reward and the per-step reward-network
    output.

    Reads the module-level names ``project_path`` and ``env_name``.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(obs_dict, reward, done, info)``, and ``render()``.
        exp_name: Experiment directory name under ``checkpoints``.
        checkpoint: Checkpoint sub-directory name (formatted into the path).
        evaluate_times: Number of episodes to run.
        render: If True, call ``env.render()`` once after every step.

    Returns:
        Tuple ``(success_cnt, env_rewards, correlations, policy, reward_net)``:
        number of episodes counted as successful, list of per-episode summed
        environment rewards, list of per-episode Spearman coefficients, the
        loaded policy, and the loaded reward network.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    checkpoint_dir = f"{project_path}/checkpoints/{exp_name}/{checkpoint}"

    policy = PPO.load(
        f"{checkpoint_dir}/gen_policy/model",
        custom_objects={'policy_class': CustomLoggingPolicy},
        device=device,
    )
    # SECURITY: the reward network is stored as a fully pickled module, so
    # loading it executes arbitrary pickled code. Only load checkpoints you
    # trust. `weights_only=False` is passed explicitly because this file is a
    # whole module rather than a state dict.
    reward_net = torch.load(
        f"{checkpoint_dir}/reward_train.pt",
        map_location=device,
        weights_only=False,
    )
    reward_net.eval()
    reward_net.to(device)

    if env_name == "Lift":
        obs_keys = ["cube_pos", "robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos"]
    else:
        obs_keys = ["object-state", "robot0_eef_pos", "robot0_eef_quat", "robot0_gripper_qpos"]

    # Threshold applied to element 2 of the flattened observation at episode
    # end. For "Lift" that element is the third component of "cube_pos".
    # NOTE(review): presumably the cube's height; for other environments
    # element 2 comes from "object-state" and may mean something else — confirm.
    success_threshold = 0.84

    # The reward network is always told the transition is non-terminal, also
    # on the last step of an episode (unchanged from the previous behaviour).
    done_flag = torch.zeros((1, 1), dtype=torch.float32, device=device)

    env_rewards = []
    correlations = []
    success_cnt = 0
    for _ in range(evaluate_times):
        obs = _flatten_obs(env.reset(), obs_keys)
        rewards = []
        total_disc_rew = []
        while True:
            action, _ = policy.predict(obs, deterministic=True)
            action = action.squeeze()

            next_obs_dict, reward, episode_over, _info = env.step(action)
            next_obs = _flatten_obs(next_obs_dict, obs_keys)
            if render:
                env.render()

            # NOTE(review): the current observation is fed with three leading
            # singleton axes, (1, 1, 1, D), while the next observation is
            # (1, D). This reproduces the shapes the reward network has been
            # receiving so far; its input handling is not visible here, so the
            # shape is kept as-is — confirm whether (1, D) is what is intended.
            obs_tensor = torch.as_tensor(obs, dtype=torch.float32, device=device).reshape(1, 1, 1, -1)
            action_tensor = torch.as_tensor(action, dtype=torch.float32, device=device).unsqueeze(0)
            next_obs_tensor = torch.as_tensor(next_obs, dtype=torch.float32, device=device).unsqueeze(0)

            # Evaluation only: no gradients are needed for the learned reward.
            with torch.no_grad():
                disc_rew = reward_net(obs_tensor, action_tensor, next_obs_tensor, done_flag)
            total_disc_rew.append(disc_rew.item())
            rewards.append(reward)

            obs = next_obs
            if episode_over:
                # Success is judged once, on the final observation of the episode.
                if next_obs[2] > success_threshold:
                    success_cnt += 1
                break

        # Rank correlation between env reward and learned reward over the episode.
        correlation = scipy.stats.spearmanr(rewards, total_disc_rew)
        correlations.append(correlation[0])

        env_rewards.append(sum(rewards))

    print(f"Success Rate: {success_cnt}/{evaluate_times}")
    print(f"Average Reward: {np.mean(env_rewards)}")
    print(f"Average Correlation: {np.mean(correlations)}")
    print(f"reward list: {env_rewards}")
    return success_cnt, env_rewards, correlations, policy, reward_net

In [5]:
# Sweep over saved checkpoints and remember the best one by mean episode
# reward and by success count. `checkpoint_end` is exclusive (range semantics).
checkpoint_start = 100
checkpoint_end = 400
checkpoint_step = 10
exp_name = "DPHIRL_lift_mh_adv_shaping" 

best_checkpoint_reward = None
# Start from -inf so that a checkpoint is still selected when every mean
# reward is zero or negative.
highest_score = float("-inf")
best_checkpoint_success = None
# Stays at 0 / None when no checkpoint achieves a single success.
highest_success = 0
for i in range(checkpoint_start, checkpoint_end, checkpoint_step):
    print(f"Checkpoint {i}")
    success_cnt, env_rewards, correlations, policy, reward_net = evaluate_policy_on_env(env, exp_name, str(i), evaluate_times=5, render=False)
    mean_reward = np.mean(env_rewards)
    if mean_reward > highest_score:
        highest_score = mean_reward
        best_checkpoint_reward = i
    if success_cnt > highest_success:
        highest_success = success_cnt
        best_checkpoint_success = i
    
    print("*************************************************************************************************")


print(f"Best checkpoint based on reward: {best_checkpoint_reward}")
print(f"Best checkpoint based on success: {best_checkpoint_success}")
print("highest_score", highest_score)
print("highest_success", highest_success)



Checkpoint 400


Exception: code expected at most 16 arguments, got 18
Exception: code expected at most 16 arguments, got 18
  reward_net = torch.load(f"{project_path}/checkpoints/{exp_name}/{checkpoint}/reward_train.pt", map_location=reward_net_device)


Success Rate: 0/5
Average Reward: 84.3483989282034
Average Correlation: -0.3734024815410696
reward list: [88.09039829875572, 46.626451149831524, 86.60383100640115, 101.72740745654556, 98.69390672948299]
*************************************************************************************************
Checkpoint 410


Exception: code expected at most 16 arguments, got 18
Exception: code expected at most 16 arguments, got 18


Success Rate: 0/5
Average Reward: 88.44610431709118
Average Correlation: -0.5078838309809661
reward list: [97.6577759462132, 40.33785708726892, 105.85819109302646, 107.58477005068633, 90.79192740826099]
*************************************************************************************************
Checkpoint 420
Success Rate: 0/5
Average Reward: 74.16809215576686
Average Correlation: -0.11155386170957456
reward list: [62.96053075763963, 37.64160591162272, 92.2355409556465, 92.24787338441843, 85.754909769507]
*************************************************************************************************
Checkpoint 430
Success Rate: 0/5
Average Reward: 83.13773346030405
Average Correlation: 0.5530699674440827
reward list: [88.82717862964564, 56.96685251397061, 77.8198490372119, 95.36254538823128, 96.71224173246083]
*************************************************************************************************
Checkpoint 440
Success Rate: 0/5
Average Reward: 73.14373045173889
Averag