In [None]:
import os

import torch
from google.colab import drive

# Mount Google Drive and switch into the project checkout so that the
# relative paths ("testing/...", "./volume/...") and the project-local
# imports used by later cells resolve.
PROJECT_ROOT = '/content/drive/MyDrive/ImitationLearning/Invariant-Causal-Imitation-Learning-main/'

drive.mount('/content/drive')
os.chdir(PROJECT_ROOT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# load

In [None]:
!pip install mpi4py 
!pip install box2d-py
!pip install box2d 
!pip3 install gym[Box_2D] 
!pip install gym==0.17.2 -qqq
!pip install numpy~=1.18.2 -qqq
!pip install pandas~=1.0.4 -qqq
!pip install PyYAML~=5.4.1 -qqq
!pip install scikit-learn~=0.22.2 -qqq
!pip install scipy~=1.1.0 -qqq
!pip install stable-baselines~=2.10.1 -qqq
!pip install tensorflow~=1.15.0 -qqq
!pip install torch>=1.6.0 -qqq
!pip install tqdm~=4.32.1 -qqq




#config

In [None]:

# Experiment configuration shared by every cell below. Keys are inserted in a
# fixed order because the dict is printed and pickled as-is by the run cells.
config = dict(
    ENV="CartPole-v1",
    ALG="BCIRMStudent_Apr17",
    NUM_TRAJS_GIVEN=20,
    NUM_TRAINING_ENVS=2,
    NOISE_DIM=4,
    REP_SIZE=16,
    TRAJ_SHIFT=20,
    SAMPLING_RATE=5,
    NUM_STEPS_TRAIN=10000,
    NUM_TRAJS_VALID=100,
    NUM_REPETITIONS=15,
    BATCH_SIZE=64,
    MLP_WIDTHS=64,
    ADAM_ALPHA=1e-3,
    SGLD_BUFFER_SIZE=10000,
    SGLD_LEARN_RATE=0.01,
    SGLD_NOISE_COEF=0.01,
    SGLD_NUM_STEPS=100,
    SGLD_REINIT_FREQ=0.05,
    NUM_STEPS_TRAIN_ENERGY_MODEL=1000,
    TRIAL=0,
)

# Active selection. Alternatives used elsewhere in this notebook:
# ENV "CartPole-v1", METHOD "BC".
config.update(ENV="LunarLander-v2", METHOD="BCIRM")

# Extra hyper-parameters that are only set when the BCIRM method is selected.
if config['METHOD'] == 'BCIRM':
    config.update(
        l2_regularizer_weight=0.001,
        penalty_weight=10000,
        penalty_anneal_iters=2500,
    )



#testing/il

In [None]:
import argparse
import os
import pickle

import gym
import numpy as np
import pandas as pd
import yaml
import numpy as np

try:
    from paths import get_model_path, get_trajs_path  # noqa
except (ModuleNotFoundError, ImportError):
    from testing.paths import get_model_path, get_trajs_path  # pylint: disable=reimported

from contrib.energy_model import EnergyModel
from contrib.env_wrapper import EnvWrapper, get_test_mult_factors
from network import (
    EnvDiscriminator,
    FeaturesDecoder,
    FeaturesEncoder,
    MineNetwork,
    ObservationsDecoder,
    StudentNetwork,
)
from student import ICILStudent, BCStudent, BCIRMStudent
from testing.train_utils import fill_buffer, make_agent, save_results


  "stable-baselines is in maintenance mode, please use [Stable-Baselines3 (SB3)](https://github.com/DLR-RM/stable-baselines3) for an up-to-date version. You can find a [migration guide](https://stable-baselines3.readthedocs.io/en/master/guide/migration.html) in SB3 documentation."


# make student

In [None]:
# Instantiate the configured environment to inspect its action space.
env = gym.make(config["ENV"])
# Number of discrete actions; displayed as the cell output.
env.action_space.n

4

In [None]:


# pylint: disable=redefined-outer-name
def make_student(run_seed, config):
    """Build the student learner selected by ``config["METHOD"]``.

    Creates the environment, loads the pre-trained teacher, fills a buffer
    from the teacher's trajectory files, builds the shared encoder and policy
    networks, and wraps everything in the student class for the chosen method.

    Args:
        run_seed: Seed forwarded to the path helpers and to ``fill_buffer``.
        config: Experiment settings dict. Reads "METHOD" ("BC", "BCIRM" or
            "ICIL"), "ENV", "ALG", "EXPERT_ALG", "NOISE_DIM",
            "NUM_TRAINING_ENVS", "BATCH_SIZE", "TRAJ_SHIFT",
            "NUM_TRAJS_GIVEN", "SAMPLING_RATE", "REP_SIZE", "MLP_WIDTHS" and
            "ADAM_ALPHA"; the ICIL method additionally reads the "SGLD_*"
            keys and "NUM_STEPS_TRAIN_ENERGY_MODEL".

    Returns:
        A ``BCStudent``, ``BCIRMStudent`` or ``ICILStudent`` instance.

    Raises:
        ValueError: If ``config["METHOD"]`` is not a supported method.
        KeyError: If a required key is missing from ``config``.
    """
    # Fail fast on a bad method name, before the teacher and buffer are
    # loaded, instead of implicitly returning None at the end.
    method = config['METHOD']
    if method not in ('BC', 'BCIRM', 'ICIL'):
        raise ValueError("Unknown METHOD %r; expected 'BC', 'BCIRM' or 'ICIL'." % (method,))

    env = gym.make(config["ENV"])
    trajs_path = get_trajs_path(config["ENV"], "student_" + config["ALG"], env_id="student", run_seed=run_seed)
    model_path = get_model_path(config["ENV"], "student_" + config["ALG"], run_seed=run_seed)

    # Observations are widened by NOISE_DIM extra features.
    state_dim = env.observation_space.shape[0] + config["NOISE_DIM"]
    action_dim = env.action_space.n
    num_training_envs = config["NUM_TRAINING_ENVS"]

    batch_size = config["BATCH_SIZE"]
    teacher = make_agent(config["ENV"], config["EXPERT_ALG"], config["NUM_TRAINING_ENVS"])
    teacher.load_pretrained()

    buffer = fill_buffer(
        trajs_path=teacher.trajs_paths,
        batch_size=batch_size,
        run_seed=run_seed,
        traj_shift=config["TRAJ_SHIFT"],
        buffer_size_in_trajs=config["NUM_TRAJS_GIVEN"],
        sampling_rate=config["SAMPLING_RATE"],
    )

    # Never ask for a batch larger than the data that was loaded.
    if buffer.total_size < batch_size:
        batch_size = buffer.total_size

    ##########################      COMMON      ##########################

    print("state_dim", state_dim)

    causal_features_encoder = FeaturesEncoder(
        input_size=state_dim, representation_size=config["REP_SIZE"], width=config["MLP_WIDTHS"]
    )

    policy_network = StudentNetwork(in_dim=config["REP_SIZE"], out_dim=action_dim, width=config["MLP_WIDTHS"])

    # Keyword arguments accepted by every student class.
    common_kwargs = dict(
        env=env,
        trajs_paths=trajs_path,
        model_path=model_path,
        num_training_envs=num_training_envs,
        teacher=teacher,
        causal_features_encoder=causal_features_encoder,
        policy_network=policy_network,
        buffer=buffer,
        adam_alpha=config["ADAM_ALPHA"],
        config=config,
    )

    ##########################       BC       #######################

    if method == 'BC':
        return BCStudent(**common_kwargs)

    ##########################       BC IRM       #######################

    if method == 'BCIRM':
        return BCIRMStudent(**common_kwargs)

    ##########################       ICIL        #######################

    # The energy model is built and trained before the remaining ICIL
    # networks are created, as in the original statement order.
    energy_model = EnergyModel(
        in_dim=state_dim,
        width=config["MLP_WIDTHS"],
        batch_size=batch_size,
        adam_alpha=config["ADAM_ALPHA"],
        buffer=buffer,
        sgld_buffer_size=config["SGLD_BUFFER_SIZE"],
        sgld_learn_rate=config["SGLD_LEARN_RATE"],
        sgld_noise_coef=config["SGLD_NOISE_COEF"],
        sgld_num_steps=config["SGLD_NUM_STEPS"],
        sgld_reinit_freq=config["SGLD_REINIT_FREQ"],
    )
    energy_model.train(num_updates=config["NUM_STEPS_TRAIN_ENERGY_MODEL"])

    causal_features_decoder = FeaturesDecoder(
        action_size=action_dim, representation_size=config["REP_SIZE"], width=config["MLP_WIDTHS"]
    )

    observations_decoder = ObservationsDecoder(
        representation_size=config["REP_SIZE"], out_size=state_dim, width=config["MLP_WIDTHS"]
    )

    env_discriminator = EnvDiscriminator(
        representation_size=config["REP_SIZE"], num_envs=config["NUM_TRAINING_ENVS"], width=config["MLP_WIDTHS"]
    )

    # One noise encoder/decoder pair per training environment.
    noise_features_encoders = [
        FeaturesEncoder(input_size=state_dim, representation_size=config["REP_SIZE"], width=config["MLP_WIDTHS"])
        for _ in range(num_training_envs)
    ]
    noise_features_decoders = [
        FeaturesDecoder(action_size=action_dim, representation_size=config["REP_SIZE"], width=config["MLP_WIDTHS"])
        for _ in range(num_training_envs)
    ]

    print(noise_features_decoders)

    mine_network = MineNetwork(x_dim=config["REP_SIZE"], z_dim=config["REP_SIZE"], width=config["MLP_WIDTHS"])

    return ICILStudent(
        noise_features_encoders=noise_features_encoders,
        causal_features_decoder=causal_features_decoder,
        noise_features_decoders=noise_features_decoders,
        observations_decoder=observations_decoder,
        env_discriminator=env_discriminator,
        energy_model=energy_model,
        mine_network=mine_network,
        **common_kwargs
    )


def init_arg():
    """Parse the command-line options of the experiment script.

    Returns:
        argparse.Namespace with ``env_name`` (default "CartPole-v1"),
        ``num_trajectories`` (int, default 20) and ``trial`` (int, default 0).
    """
    cli = argparse.ArgumentParser()
    option_specs = (
        ("--env_name", {"default": "CartPole-v1"}),
        ("--num_trajectories", {"default": 20, "type": int}),
        ("--trial", {"default": 0, "type": int}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()


# 10 Trials -- BCIRM

In [None]:
# Sweep the number of expert trajectories for the BCIRM student. For every
# setting, run one trial of config["NUM_REPETITIONS"] seeded train/test runs
# and report the mean test return.
config['METHOD'] = "BCIRM"

for traj_num in [2, 4, 8, 16, 32, 64, 128]:
    config["NUM_TRAJS_GIVEN"] = traj_num
    config["TRAJ_SHIFT"] = traj_num

    config['ALG'] = "FINAL_BCIRMStudent_replicatedata_trajnum" + str(traj_num)

    # BCIRM hyper-parameters for this sweep (METHOD is fixed to BCIRM above).
    config['l2_regularizer_weight'] = 0.001
    config['penalty_weight'] = 10000
    config['penalty_anneal_iters'] = 5000

    all_results_trail = []

    for trail in range(1):
        config['TRIAL'] = trail

        ###############.  start a trial   ###############

        # Look up the expert algorithm registered for this environment.
        # yaml.FullLoader is used on a project-local file; do not point this
        # at untrusted YAML.
        with open("testing/config.yml") as config_yml:
            config["EXPERT_ALG"] = yaml.load(config_yml, Loader=yaml.FullLoader)[config["ENV"]]
        print("Config: %s" % config)

        TRIAL = config["TRIAL"]
        print("Trial number %s" % TRIAL)

        results_dir_base = "testing/results/"
        results_dir = os.path.join(results_dir_base, config["ENV"], str(config["NUM_TRAJS_GIVEN"]), config["ALG"])
        os.makedirs(results_dir, exist_ok=True)

        config_file = "trial_" + str(TRIAL) + "_" + "config.pkl"
        config_path = os.path.join(results_dir, config_file)

        results_file_name = "trial_" + str(TRIAL) + "_" + "results.csv"
        results_file_path = os.path.join(results_dir, results_file_name)

        # Refuse to overwrite an earlier trial's configuration and results.
        if os.path.exists(config_path):
            raise FileExistsError("CONFIG file already exists %s. Choose a different trial number." % config_file)
        with open(config_path, "wb") as config_pkl:
            pickle.dump(config, config_pkl)

        ###############.  NUM_REPETITIONS runs for each trial   ###############

        print("config method = ", config['METHOD'])
        print("config env = ", config['ENV'])

        for run_seed in range(config["NUM_REPETITIONS"]):
            print("Run %s out of %s" % (run_seed + 1, config["NUM_REPETITIONS"]))
            student = make_student(run_seed, config)
            student.train(num_updates=config["NUM_STEPS_TRAIN"])

            env_wrapper_out_of_sample = EnvWrapper(
                env=gym.make(config["ENV"]), mult_factor=get_test_mult_factors(config['NOISE_DIM'] - 1), idx=3, seed=1
            )

            # NOTE(review): only this BCIRM sweep zeroes the wrapper noise; the
            # BC and ICIL sweeps evaluate without this line. Confirm intended.
            env_wrapper_out_of_sample.noise = 0

            action_match, return_mean, return_std = student.test(
                num_episodes=config["NUM_TRAJS_VALID"], env_wrapper=env_wrapper_out_of_sample
            )

            result = (action_match, return_mean, return_std)
            print("###############    Reward for test environment for run %s: %s.   ###############\n\n" % (run_seed + 1, return_mean))
            save_results(results_file_path, run_seed, action_match, return_mean, return_std)

        # Read back exactly the file this trial wrote.
        results_trial = pd.read_csv(results_file_path, header=None)

        # Column 2 of the header-less CSV; presumably return_mean given the
        # argument order passed to save_results -- verify against save_results.
        mean_return = np.mean(results_trial[2].values)
        print("Average reward for %s repetitions: %s" % (len(results_trial), mean_return))

        all_results_trail.append(mean_return)
    print("ALL RESULTS TRAIL:" , all_results_trail)




[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
8400 tensor(0.0003, device='cuda:0', grad_fn=<DivBackward0>)
8600 tensor(0.0004, device='cuda:0', grad_fn=<DivBackward0>)
8800 tensor(0.0010, device='cuda:0', grad_fn=<DivBackward0>)
9000 tensor(0.0026, device='cuda:0', grad_fn=<DivBackward0>)
9200 tensor(0.0007, device='cuda:0', grad_fn=<DivBackward0>)
9400 tensor(0.0002, device='cuda:0', grad_fn=<DivBackward0>)
9600 tensor(0.0004, device='cuda:0', grad_fn=<DivBackward0>)
9800 tensor(0.0002, device='cuda:0', grad_fn=<DivBackward0>)
10000 tensor(0.0010, device='cuda:0', grad_fn=<DivBackward0>)
###############    Reward for test environment for run 15: -110.50911553111006.   ###############


Average reward for 10 repetitions: -258.62343578818223
ALL RESULTS TRAIL: [-258.62343578818223]
Config: {'ENV': 'LunarLander-v2', 'ALG': 'FINAL_BCIRMStudent_replicatedata_trajnum4', 'NUM_TRAJS_GIVEN': 4, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 4, 'SAMPLING_RATE': 5, 'NUM_STEPS_T

# 10 Trials -- BC

In [None]:
# Sweep the number of expert trajectories for the plain BC student. For every
# setting, run one trial of config["NUM_REPETITIONS"] seeded train/test runs
# and report the mean test return.
config['METHOD'] = "BC"

for traj_num in [2, 4, 8, 16, 32, 64, 128]:
    config["NUM_TRAJS_GIVEN"] = traj_num
    config["TRAJ_SHIFT"] = traj_num

    config['ALG'] = "FINAL_BCStudent_replicatedata_trajnum" + str(traj_num)

    # The former `if config['METHOD'] == 'BCIRM'` block was unreachable here
    # (METHOD is fixed to "BC" above) and has been removed.

    all_results_trail = []

    for trail in range(1):
        config['TRIAL'] = trail

        ###############.  start a trial   ###############

        # Look up the expert algorithm registered for this environment.
        # yaml.FullLoader is used on a project-local file; do not point this
        # at untrusted YAML.
        with open("testing/config.yml") as config_yml:
            config["EXPERT_ALG"] = yaml.load(config_yml, Loader=yaml.FullLoader)[config["ENV"]]
        print("Config: %s" % config)

        TRIAL = config["TRIAL"]
        print("Trial number %s" % TRIAL)

        results_dir_base = "testing/results/"
        results_dir = os.path.join(results_dir_base, config["ENV"], str(config["NUM_TRAJS_GIVEN"]), config["ALG"])
        os.makedirs(results_dir, exist_ok=True)

        config_file = "trial_" + str(TRIAL) + "_" + "config.pkl"
        config_path = os.path.join(results_dir, config_file)

        results_file_name = "trial_" + str(TRIAL) + "_" + "results.csv"
        results_file_path = os.path.join(results_dir, results_file_name)

        # Refuse to overwrite an earlier trial's configuration and results.
        if os.path.exists(config_path):
            raise FileExistsError("CONFIG file already exists %s. Choose a different trial number." % config_file)
        with open(config_path, "wb") as config_pkl:
            pickle.dump(config, config_pkl)

        ###############.  NUM_REPETITIONS runs for each trial   ###############

        print("config method = ", config['METHOD'])
        print("config env = ", config['ENV'])

        for run_seed in range(config["NUM_REPETITIONS"]):
            print("Run %s out of %s" % (run_seed + 1, config["NUM_REPETITIONS"]))
            student = make_student(run_seed, config)
            student.train(num_updates=config["NUM_STEPS_TRAIN"])

            # NOTE(review): unlike the BCIRM sweep, the wrapper's `noise`
            # attribute is not set to 0 here. Confirm the evaluation setups
            # are meant to differ.
            env_wrapper_out_of_sample = EnvWrapper(
                env=gym.make(config["ENV"]), mult_factor=get_test_mult_factors(config['NOISE_DIM'] - 1), idx=3, seed=1
            )
            action_match, return_mean, return_std = student.test(
                num_episodes=config["NUM_TRAJS_VALID"], env_wrapper=env_wrapper_out_of_sample
            )

            result = (action_match, return_mean, return_std)
            print("###############    Reward for test environment for run %s: %s.   ###############\n\n" % (run_seed + 1, return_mean))
            save_results(results_file_path, run_seed, action_match, return_mean, return_std)

        # Read back exactly the file this trial wrote.
        results_trial = pd.read_csv(results_file_path, header=None)

        # Column 2 of the header-less CSV; presumably return_mean given the
        # argument order passed to save_results -- verify against save_results.
        mean_return = np.mean(results_trial[2].values)
        print("Average reward for %s repetitions: %s" % (len(results_trial), mean_return))

        all_results_trail.append(mean_return)

    print("ALL RESULTS TRAIL:" , all_results_trail)

Config: {'ENV': 'LunarLander-v2', 'ALG': 'FINAL_BCStudent_replicatedata_trajnum2', 'NUM_TRAJS_GIVEN': 2, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 2, 'SAMPLING_RATE': 5, 'NUM_STEPS_TRAIN': 10000, 'NUM_TRAJS_VALID': 100, 'NUM_REPETITIONS': 15, 'BATCH_SIZE': 64, 'MLP_WIDTHS': 64, 'ADAM_ALPHA': 0.001, 'SGLD_BUFFER_SIZE': 10000, 'SGLD_LEARN_RATE': 0.01, 'SGLD_NOISE_COEF': 0.01, 'SGLD_NUM_STEPS': 100, 'SGLD_REINIT_FREQ': 0.05, 'NUM_STEPS_TRAIN_ENERGY_MODEL': 1000, 'TRIAL': 0, 'METHOD': 'BC', 'l2_regularizer_weight': 0.001, 'penalty_weight': 10000, 'penalty_anneal_iters': 5000, 'EXPERT_ALG': 'dqn'}
Trial number 0
config method =  BC
config env =  LunarLander-v2
Run 1 out of 15
state_dim 12
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 1: -49.63886930878617.   ###############


Run 2 out of 15
state_dim 12
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 2: 

# 10 Trials -- ICIL

In [None]:
# Sweep the number of expert trajectories for the ICIL student. For every
# setting, run one trial of config["NUM_REPETITIONS"] seeded train/test runs
# and report the mean test return.
config['METHOD'] = "ICIL"

for traj_num in [2, 4, 8, 16, 32, 64, 128]:
    config["NUM_TRAJS_GIVEN"] = traj_num
    config["TRAJ_SHIFT"] = traj_num

    # The prefix previously ended in a stray "2" ("...trajnum2" + n), giving
    # names such as "trajnum22"; it now matches the BC/BCIRM naming scheme.
    config['ALG'] = "FINAL_ICILStudent_replicatedata_trajnum" + str(traj_num)

    # The former `if config['METHOD'] == 'BCIRM'` block was unreachable here
    # (METHOD is fixed to "ICIL" above) and has been removed.

    all_results_trail = []

    for trail in range(1):
        config['TRIAL'] = trail

        ###############.  start a trial   ###############

        # Look up the expert algorithm registered for this environment.
        # yaml.FullLoader is used on a project-local file; do not point this
        # at untrusted YAML.
        with open("testing/config.yml") as config_yml:
            config["EXPERT_ALG"] = yaml.load(config_yml, Loader=yaml.FullLoader)[config["ENV"]]
        print("Config: %s" % config)

        TRIAL = config["TRIAL"]
        print("Trial number %s" % TRIAL)

        results_dir_base = "testing/results/"
        results_dir = os.path.join(results_dir_base, config["ENV"], str(config["NUM_TRAJS_GIVEN"]), config["ALG"])
        os.makedirs(results_dir, exist_ok=True)

        config_file = "trial_" + str(TRIAL) + "_" + "config.pkl"
        config_path = os.path.join(results_dir, config_file)

        results_file_name = "trial_" + str(TRIAL) + "_" + "results.csv"
        results_file_path = os.path.join(results_dir, results_file_name)

        # Refuse to overwrite an earlier trial's configuration and results.
        if os.path.exists(config_path):
            raise FileExistsError("CONFIG file already exists %s. Choose a different trial number." % config_file)
        with open(config_path, "wb") as config_pkl:
            pickle.dump(config, config_pkl)

        ###############.  NUM_REPETITIONS runs for each trial   ###############

        print("config method = ", config['METHOD'])
        print("config env = ", config['ENV'])

        for run_seed in range(config["NUM_REPETITIONS"]):
            print("Run %s out of %s" % (run_seed + 1, config["NUM_REPETITIONS"]))
            student = make_student(run_seed, config)
            student.train(num_updates=config["NUM_STEPS_TRAIN"])

            # NOTE(review): unlike the BCIRM sweep, the wrapper's `noise`
            # attribute is not set to 0 here. Confirm the evaluation setups
            # are meant to differ.
            env_wrapper_out_of_sample = EnvWrapper(
                env=gym.make(config["ENV"]), mult_factor=get_test_mult_factors(config['NOISE_DIM'] - 1), idx=3, seed=1
            )
            action_match, return_mean, return_std = student.test(
                num_episodes=config["NUM_TRAJS_VALID"], env_wrapper=env_wrapper_out_of_sample
            )

            result = (action_match, return_mean, return_std)
            print("###############    Reward for test environment for run %s: %s.   ###############\n\n" % (run_seed + 1, return_mean))
            save_results(results_file_path, run_seed, action_match, return_mean, return_std)

        # Read back exactly the file this trial wrote.
        results_trial = pd.read_csv(results_file_path, header=None)

        # Column 2 of the header-less CSV; presumably return_mean given the
        # argument order passed to save_results -- verify against save_results.
        mean_return = np.mean(results_trial[2].values)
        print("Average reward for %s repetitions: %s" % (len(results_trial), mean_return))

        all_results_trail.append(mean_return)

    print("ALL RESULTS TRAIL:" , all_results_trail)

Config: {'ENV': 'LunarLander-v2', 'ALG': 'FINAL_ICILStudent_replicatedata_trajnum22', 'NUM_TRAJS_GIVEN': 2, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 2, 'SAMPLING_RATE': 5, 'NUM_STEPS_TRAIN': 10000, 'NUM_TRAJS_VALID': 100, 'NUM_REPETITIONS': 15, 'BATCH_SIZE': 64, 'MLP_WIDTHS': 64, 'ADAM_ALPHA': 0.001, 'SGLD_BUFFER_SIZE': 10000, 'SGLD_LEARN_RATE': 0.01, 'SGLD_NOISE_COEF': 0.01, 'SGLD_NUM_STEPS': 100, 'SGLD_REINIT_FREQ': 0.05, 'NUM_STEPS_TRAIN_ENERGY_MODEL': 1000, 'TRIAL': 0, 'METHOD': 'ICIL', 'l2_regularizer_weight': 0.001, 'penalty_weight': 10000, 'penalty_anneal_iters': 2500, 'EXPERT_ALG': 'dqn'}
Trial number 0
config method =  ICIL
config env =  LunarLander-v2
Run 1 out of 15
state_dim 12


100%|██████████| 1000/1000 [01:14<00:00, 13.41it/s]


[FeaturesDecoder(
  (layers): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesDecoder(
  (layers): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(

RuntimeError: ignored

#line by line

In [None]:

# Single-trial setup: resolve the expert algorithm, create the results
# directory and persist the configuration used for this trial.

# yaml.FullLoader is used on a project-local file; do not point this at
# untrusted YAML.
with open("testing/config.yml") as config_yml:
    config["EXPERT_ALG"] = yaml.load(config_yml, Loader=yaml.FullLoader)[config["ENV"]]
print("Config: %s" % config)


TRIAL = config["TRIAL"]
print("Trial number %s" % TRIAL)


results_dir_base = "testing/results/"
results_dir = os.path.join(results_dir_base, config["ENV"], str(config["NUM_TRAJS_GIVEN"]), config["ALG"])
os.makedirs(results_dir, exist_ok=True)


config_file = "trial_" + str(TRIAL) + "_" + "config.pkl"
config_path = os.path.join(results_dir, config_file)

results_file_name = "trial_" + str(TRIAL) + "_" + "results.csv"
results_file_path = os.path.join(results_dir, results_file_name)

# Refuse to overwrite an earlier trial's configuration and results.
if os.path.exists(config_path):
    raise FileExistsError("CONFIG file already exists %s. Choose a different trial number." % config_file)
with open(config_path, "wb") as config_pkl:
    pickle.dump(config, config_pkl)


Config: {'ENV': 'CartPole-v1', 'ALG': 'ICILStudent', 'NUM_TRAJS_GIVEN': 10, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 20, 'SAMPLING_RATE': 5, 'NUM_STEPS_TRAIN': 10000, 'NUM_TRAJS_VALID': 100, 'NUM_REPETITIONS': 10, 'BATCH_SIZE': 64, 'MLP_WIDTHS': 64, 'ADAM_ALPHA': 0.001, 'SGLD_BUFFER_SIZE': 10000, 'SGLD_LEARN_RATE': 0.01, 'SGLD_NOISE_COEF': 0.01, 'SGLD_NUM_STEPS': 100, 'SGLD_REINIT_FREQ': 0.05, 'NUM_STEPS_TRAIN_ENERGY_MODEL': 1000, 'TRIAL': 8, 'METHOD': 'BC', 'EXPERT_ALG': 'dqn'}
Trial number 8


In [None]:

#"""
if __name__ == "__main__":
    # Run config["NUM_REPETITIONS"] seeded train/test repetitions for the
    # current configuration. Requires `results_file_path` from the setup cell.

    print("config method = ", config['METHOD'])
    print("config env = ", config['ENV'])

    for run_seed in range(config["NUM_REPETITIONS"]):
        print("Run %s out of %s" % (run_seed + 1, config["NUM_REPETITIONS"]))
        student = make_student(run_seed, config)
        student.train(num_updates=config["NUM_STEPS_TRAIN"])

        env_wrapper_out_of_sample = EnvWrapper(
            env=gym.make(config["ENV"]), mult_factor=get_test_mult_factors(config['NOISE_DIM'] - 1), idx=3, seed=1
        )
        action_match, return_mean, return_std = student.test(
            num_episodes=config["NUM_TRAJS_VALID"], env_wrapper=env_wrapper_out_of_sample
        )

        result = (action_match, return_mean, return_std)
        print("###############    Reward for test environment for run %s: %s.   ###############\n\n" % (run_seed + 1, return_mean))
        save_results(results_file_path, run_seed, action_match, return_mean, return_std)

    # Read back exactly the file the loop above wrote, rather than rebuilding
    # the path from a config that may have changed since the setup cell ran.
    results_trial = pd.read_csv(results_file_path, header=None)

    # Column 2 of the header-less CSV; presumably return_mean given the
    # argument order passed to save_results -- verify against save_results.
    print("Average reward for %s repetitions: %s" % (len(results_trial), np.mean(results_trial[2].values)))
#"""

config method =  BC
config env =  CartPole-v1
Run 1 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 1: 166.1.   ###############


Run 2 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 2: 155.74.   ###############


Run 3 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 3: 496.46.   ###############


Run 4 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 4: 500.0.   ###############


Run 5 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 5: 500.0.   ###############


Run 6 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 6: 274.88.   ###############


Run 7 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [None]:

#"""
if __name__ == "__main__":
    # Run config["NUM_REPETITIONS"] seeded train/test repetitions for the
    # current configuration. Requires `results_file_path` from the setup cell.

    print("config method = ", config['METHOD'])
    print("config env = ", config['ENV'])

    for run_seed in range(config["NUM_REPETITIONS"]):
        print("Run %s out of %s" % (run_seed + 1, config["NUM_REPETITIONS"]))
        student = make_student(run_seed, config)
        student.train(num_updates=config["NUM_STEPS_TRAIN"])

        env_wrapper_out_of_sample = EnvWrapper(
            env=gym.make(config["ENV"]), mult_factor=get_test_mult_factors(config['NOISE_DIM'] - 1), idx=3, seed=1
        )
        action_match, return_mean, return_std = student.test(
            num_episodes=config["NUM_TRAJS_VALID"], env_wrapper=env_wrapper_out_of_sample
        )

        result = (action_match, return_mean, return_std)
        print("###############    Reward for test environment for run %s: %s.   ###############\n\n" % (run_seed + 1, return_mean))
        save_results(results_file_path, run_seed, action_match, return_mean, return_std)

    # Read back exactly the file the loop above wrote, rather than rebuilding
    # the path from a config that may have changed since the setup cell ran.
    results_trial = pd.read_csv(results_file_path, header=None)

    # Column 2 of the header-less CSV; presumably return_mean given the
    # argument order passed to save_results -- verify against save_results.
    print("Average reward for %s repetitions: %s" % (len(results_trial), np.mean(results_trial[2].values)))
#"""

config method =  BCIRM
config env =  CartPole-v1
Run 1 out of 10
200 tensor(0.4755, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.4851, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.4670, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.4648, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.4254, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.5061, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.5847, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.4992, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.3855, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.4569, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.5494, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.3457, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.4519, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.3749, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.5179, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.6392, devi

# *** *core* *** generate expert_traj_i 

In [None]:



def get_train_spurcorr_expert_trajs(datafile_name, env_name,  _noise = 0.001,  num_envs = 2):
    """Write per-environment expert trajectories with spurious features added.

    Loads an expert recording with the arrays ``obs``, ``actions`` and
    ``episode_starts``. For each training environment it appends to every
    observation (a) a linear mix of the observation's last ``obs_dim - 1``
    entries plus Gaussian noise and (b) the environment index, then splits
    the rows into episodes and saves them to
    ``./volume/<env_name>/expert_trajs_<k>.npy`` as ``{'trajs': [...]}``.
    Each trajectory is a list of ``(observation, action, env_index)`` tuples.

    Args:
        datafile_name: Path to the ``.npz`` expert recording.
        env_name: Name of the sub-directory of ``./volume`` to write into.
        _noise: Standard deviation of the Gaussian noise added to the
            spurious features.
        num_envs: Number of environments to generate (at most 2: environment 0
            mixes with the identity matrix, environment 1 with ones + identity).

    Raises:
        ValueError: If ``num_envs`` is greater than 2, for which no mixing
            matrix is defined.
    """
    import numpy as np

    # Previously an index >= 2 silently reused environment 1's mixing matrix.
    if num_envs > 2:
        raise ValueError("num_envs must be at most 2, got %r: no mixing matrix is defined for more." % (num_envs,))

    # An NpzFile reads the member array from disk on every `raw[key]` access,
    # so each array is pulled out exactly once and the file is then closed.
    with np.load(datafile_name, allow_pickle = True) as raw:
        observations = np.asarray(raw['obs'])
        actions = raw['actions']
        episode_starts = raw['episode_starts']

    obs_dim = len(observations[0])
    obs_num = len(observations)
    noise_dims = obs_dim - 1

    # Episode k spans [bounds[k], bounds[k + 1]); appending obs_num keeps the
    # final episode, which the previous start-to-start pairing dropped.
    start_index = np.where(episode_starts == 1)[0]
    episode_bounds = np.append(start_index, obs_num)

    out_dir = "./volume/" + env_name

    for expert_num in range(num_envs):
        if expert_num == 0:
            _mult_factor = np.diag(np.ones(noise_dims))
        else:
            _mult_factor = np.ones((noise_dims, noise_dims)) + np.diag(np.ones(noise_dims))

        # Vectorised over all rows: [obs | mixed last dims + noise | env index].
        # `_noise` is honoured here; it used to be overridden by a literal 0.001.
        spur_corr = np.matmul(observations[:, -noise_dims:], _mult_factor)
        spur_corr = spur_corr + np.random.randn(obs_num, noise_dims) * _noise
        env_column = np.full((obs_num, 1), expert_num, dtype=np.float64)
        obs_new = np.concatenate([observations, spur_corr, env_column], axis=1)

        data_block = [
            [(obs_new[j], actions[j][0], expert_num) for j in range(begin, end)]
            for begin, end in zip(episode_bounds[:-1], episode_bounds[1:])
        ]

        data_generated = {'trajs': data_block}

        if not os.path.exists(out_dir):
            # Create a new directory because it does not exist
            os.makedirs(out_dir)
            print("The new directory for {} is created!".format(env_name))

        out_path = out_dir + '/expert_trajs_' + str(expert_num) + '.npy'
        np.save(out_path, data_generated)
        print("\n{} saved!".format(out_path))

In [None]:
# Scratch check of the mixing matrix that get_train_spurcorr_expert_trajs
# builds for expert environment 1 (all-ones matrix plus identity).
# NOTE(review): relies on `obs_dim` already being defined in the kernel.
np.ones((obs_dim-1, obs_dim-1)) + np.diag(np.ones(obs_dim-1))

array([[2., 1., 1.],
       [1., 2., 1.],
       [1., 1., 2.]])

In [None]:
import os
# Working directory that the relative data paths resolve against (the value
# is not displayed because this is not the cell's last expression).
os.getcwd()

# NOTE(review): `datafile_name` must already exist in the kernel; in this
# notebook it is assigned in a later cell, so this cell fails on a fresh
# Restart & Run All.
raw = np.load(datafile_name, allow_pickle = True)
obs_dim = len(raw['obs'][0])
obs_num = len(raw['obs'])

# Count the entries flagged as episode starts.
(raw['episode_starts'] == 1).sum()

1000

In [None]:
# Select the expert recording and the matching environment, then generate the
# per-environment trajectory files under ./volume/<env_name>/.
# CartPole alternative: env_name 'CartPole-v1' with
# './contrib/expert_replicate/expert_cartpole_dqn_replicate_episodes1000.npz'.
env_name = 'LunarLander-v2'
datafile_name = './contrib/expert_replicate/expert_lunarlander_ppo2_replicate_episodes1000.npz'

get_train_spurcorr_expert_trajs(datafile_name, env_name, 0.001, 2)



  0%|          | 0/902005 [00:00<?, ?it/s][A
  0%|          | 3/902005 [00:00<11:15:03, 22.27it/s][A
  0%|          | 5/902005 [00:00<11:38:04, 21.54it/s][A
  0%|          | 8/902005 [00:00<11:35:07, 21.63it/s][A
  0%|          | 11/902005 [00:00<11:29:04, 21.82it/s][A
  0%|          | 14/902005 [00:00<11:27:21, 21.87it/s][A
  0%|          | 17/902005 [00:00<11:26:43, 21.89it/s][A
  0%|          | 19/902005 [00:00<11:52:15, 21.11it/s][A
  0%|          | 22/902005 [00:01<11:48:50, 21.21it/s][A
  0%|          | 25/902005 [00:01<11:44:49, 21.33it/s][A
  0%|          | 28/902005 [00:01<11:48:11, 21.23it/s][A
  0%|          | 31/902005 [00:01<11:55:39, 21.01it/s][A
  0%|          | 34/902005 [00:01<11:56:12, 20.99it/s][A
  0%|          | 37/902005 [00:01<11:51:06, 21.14it/s][A
  0%|          | 40/902005 [00:01<11:56:15, 20.99it/s][A
  0%|          | 43/902005 [00:02<11:47:26, 21.25it/s][A
  0%|          | 46/902005 [00:02<11:44:04, 21.35it/s][A
  0%|          | 49/902005 [

KeyboardInterrupt: ignored

In [None]:
# Inspect the top-level keys of a generated trajectory file. The load yields
# a 0-d object array; indexing with an empty tuple unwraps the stored object.
data1 = np.load("./volume/CartPole-v1/expert_trajs_1.npy", allow_pickle=True)
for key in data1[()].keys():
    print(key)

trajs
