In [1]:
from google.colab import drive
drive.mount('/content/drive')


import os

import torch
os.chdir('/content/drive/MyDrive/ImitationLearning/Invariant-Causal-Imitation-Learning-main/')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# load

In [2]:
!pip install mpi4py 
!pip install box2d-py
!pip install box2d 
!pip3 install gym[Box_2D] 
!pip install gym==0.17.2 -qqq
!pip install numpy~=1.18.2 -qqq
!pip install pandas~=1.0.4 -qqq
!pip install PyYAML~=5.4.1 -qqq
!pip install scikit-learn~=0.22.2 -qqq
!pip install scipy~=1.1.0 -qqq
!pip install stable-baselines~=2.10.1 -qqq
!pip install tensorflow~=1.15.0 -qqq
!pip install torch>=1.6.0 -qqq
!pip install tqdm~=4.32.1 -qqq




#config

In [3]:

config = {
    "ENV": "CartPole-v1",
    "ALG": "BCIRMStudent_Apr17",
    "NUM_TRAJS_GIVEN": 20, #
    "NUM_TRAINING_ENVS": 2,
    "NOISE_DIM": 4,
    "REP_SIZE": 16,
    "TRAJ_SHIFT": 20, # 20,
    "SAMPLING_RATE": 5,
    "NUM_STEPS_TRAIN": 10000,
    "NUM_TRAJS_VALID": 100,
    "NUM_REPETITIONS": 15,
    "BATCH_SIZE": 64,
    "MLP_WIDTHS": 64,
    "ADAM_ALPHA": 1e-3,
    "SGLD_BUFFER_SIZE": 10000,
    "SGLD_LEARN_RATE": 0.01,
    "SGLD_NOISE_COEF": 0.01,
    "SGLD_NUM_STEPS": 100,
    "SGLD_REINIT_FREQ": 0.05,
    "NUM_STEPS_TRAIN_ENERGY_MODEL": 1000,
    'TRIAL': 0
}


#config['ENV'] = "LunarLander-v2"
#config['ENV'] = "CartPole-v1"
config['ENV'] = "Acrobot-v1"

config['METHOD'] = "BCIRM"
#config['METHOD'] = "BC"




#testing/il

In [4]:
import argparse
import os
import pickle

import gym
import numpy as np
import pandas as pd
import yaml
import numpy as np

try:
    from paths import get_model_path, get_trajs_path  # noqa
except (ModuleNotFoundError, ImportError):
    from testing.paths import get_model_path, get_trajs_path  # pylint: disable=reimported

from contrib.energy_model import EnergyModel
from contrib.env_wrapper import EnvWrapper, get_test_mult_factors
from network import (
    EnvDiscriminator,
    FeaturesDecoder,
    FeaturesEncoder,
    MineNetwork,
    ObservationsDecoder,
    StudentNetwork,
)
from student import ICILStudent, BCStudent, BCIRMStudent
from testing.train_utils import fill_buffer, make_agent, save_results


  "stable-baselines is in maintenance mode, please use [Stable-Baselines3 (SB3)](https://github.com/DLR-RM/stable-baselines3) for an up-to-date version. You can find a [migration guide](https://stable-baselines3.readthedocs.io/en/master/guide/migration.html) in SB3 documentation."


# make student

In [5]:


# pylint: disable=redefined-outer-name
def make_student(run_seed, config):
    env = gym.make(config["ENV"])
    trajs_path = get_trajs_path(config["ENV"], "student_" + config["ALG"], env_id="student", run_seed=run_seed)
    model_path = get_model_path(config["ENV"], "student_" + config["ALG"], run_seed=run_seed)

    state_dim = env.observation_space.shape[0] + config["NOISE_DIM"]
    action_dim = env.action_space.n
    num_training_envs = config["NUM_TRAINING_ENVS"]

    # run_seed = run_seed
    batch_size = config["BATCH_SIZE"]
    teacher = make_agent(config["ENV"], config["EXPERT_ALG"], config["NUM_TRAINING_ENVS"])
    teacher.load_pretrained()

    buffer = fill_buffer(
        trajs_path=teacher.trajs_paths,
        batch_size=batch_size,
        run_seed=run_seed,
        traj_shift=config["TRAJ_SHIFT"],
        buffer_size_in_trajs=config["NUM_TRAJS_GIVEN"],
        sampling_rate=config["SAMPLING_RATE"],
    )

    if buffer.total_size < batch_size:
        batch_size = buffer.total_size



    ##########################      COMMON      ##########################

    print("state_dim", state_dim)

    causal_features_encoder = FeaturesEncoder(
        input_size=state_dim, representation_size=config["REP_SIZE"], width=config["MLP_WIDTHS"]
    )

    policy_network = StudentNetwork(in_dim=config["REP_SIZE"], out_dim=action_dim, width=config["MLP_WIDTHS"])

    #print("config method = ", config['METHOD'])


    ##########################       BC       #######################

    if config['METHOD'] == 'BC':

        return BCStudent(
            env=env,
            trajs_paths=trajs_path,
            model_path=model_path,
            num_training_envs=num_training_envs,
            teacher=teacher,
            causal_features_encoder=causal_features_encoder,
            policy_network=policy_network,
            buffer=buffer,
            adam_alpha=config["ADAM_ALPHA"],
            config = config
        )


    ##########################       BC IRM       #######################


    elif config['METHOD'] == 'BCIRM':

        return BCIRMStudent(
            env=env,
            trajs_paths=trajs_path,
            model_path=model_path,
            num_training_envs=num_training_envs,
            teacher=teacher,
            causal_features_encoder=causal_features_encoder,
            policy_network=policy_network,
            buffer=buffer,
            adam_alpha=config["ADAM_ALPHA"],
            config = config
        )

    ##########################       ICIL        #######################

    elif config['METHOD'] == 'ICIL':
        energy_model = EnergyModel(
            in_dim=state_dim,
            width=config["MLP_WIDTHS"],
            batch_size=batch_size,
            adam_alpha=config["ADAM_ALPHA"],
            buffer=buffer,
            sgld_buffer_size=config["SGLD_BUFFER_SIZE"],
            sgld_learn_rate=config["SGLD_LEARN_RATE"],
            sgld_noise_coef=config["SGLD_NOISE_COEF"],
            sgld_num_steps=config["SGLD_NUM_STEPS"],
            sgld_reinit_freq=config["SGLD_REINIT_FREQ"],
        )
        energy_model.train(num_updates=config["NUM_STEPS_TRAIN_ENERGY_MODEL"])


        causal_features_decoder = FeaturesDecoder(
            action_size=action_dim, representation_size=config["REP_SIZE"], width=config["MLP_WIDTHS"]
        )

        observations_decoder = ObservationsDecoder(
            representation_size=config["REP_SIZE"], out_size=state_dim, width=config["MLP_WIDTHS"]
        )


        env_discriminator = EnvDiscriminator(
            representation_size=config["REP_SIZE"], num_envs=config["NUM_TRAINING_ENVS"], width=config["MLP_WIDTHS"]
        )

        noise_features_encoders = [
            FeaturesEncoder(input_size=state_dim, representation_size=config["REP_SIZE"], width=config["MLP_WIDTHS"])
            for i in range(num_training_envs)
        ]
        noise_features_decoders = [
            FeaturesDecoder(action_size=action_dim, representation_size=config["REP_SIZE"], width=config["MLP_WIDTHS"])
            for i in range(num_training_envs)
        ]

        mine_network = MineNetwork(x_dim=config["REP_SIZE"], z_dim=config["REP_SIZE"], width=config["MLP_WIDTHS"])

        return ICILStudent(
            env=env,
            trajs_paths=trajs_path,
            model_path=model_path,
            num_training_envs=num_training_envs,
            teacher=teacher,
            causal_features_encoder=causal_features_encoder,
            noise_features_encoders=noise_features_encoders,
            causal_features_decoder=causal_features_decoder,
            noise_features_decoders=noise_features_decoders,
            observations_decoder=observations_decoder,
            env_discriminator=env_discriminator,
            policy_network=policy_network,
            energy_model=energy_model,
            mine_network=mine_network,
            buffer=buffer,
            adam_alpha=config["ADAM_ALPHA"],
            config = config
        )


def init_arg():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env_name", default="CartPole-v1")
    parser.add_argument("--num_trajectories", default=20, type=int)
    parser.add_argument("--trial", default=0, type=int)
    return parser.parse_args()


#10 Trails -- BCIRM

In [7]:
config['METHOD'] = "BCIRM"

for traj_num in [1,2,3,4,5,6,7,8,9,10, 20]:
    config["NUM_TRAJS_GIVEN"] = traj_num
    config["TRAJ_SHIFT"] = traj_num

    config['ALG'] = "FINAL_BCIRMStudent_replicatedata_trajnum" + str(traj_num)


    ###############.  settings   ###############
    #config['ALG'] = "BCIRMStudent_Apr19_replicatedata"
    #config['METHOD'] = "BCIRM"
    #config['METHOD'] = "ICIL"
    #config["NUM_TRAJS_GIVEN"] = 50
    #config["TRAJ_SHIFT"] = 50
    #config['ENV'] == "CartPole-v1"
    ###############.  settings   ###############


    if config['METHOD'] == 'BCIRM':
        config['l2_regularizer_weight'] = 0.001
        config['penalty_weight'] = 10000
        config['penalty_anneal_iters'] = 5000

    all_results_trail = []

    for trail in range(1): 
        config['TRIAL'] = trail 


        ###############.  start a trail   ###############

        config["EXPERT_ALG"] = yaml.load(open("testing/config.yml"), Loader=yaml.FullLoader)[config["ENV"]]
        print("Config: %s" % config)

        TRIAL = config["TRIAL"] #args.trial
        print("Trial number %s" % TRIAL)

        results_dir_base = "testing/results/"
        results_dir = os.path.join(results_dir_base, config["ENV"], str(config["NUM_TRAJS_GIVEN"]), config["ALG"])

        if not os.path.exists(results_dir):
            os.makedirs(results_dir)

        config_file = "trial_" + str(TRIAL) + "_" + "config.pkl"

        results_file_name = "trial_" + str(TRIAL) + "_" + "results.csv"
        results_file_path = os.path.join(results_dir, results_file_name)

        if os.path.exists(os.path.join(results_dir, config_file)):
            raise NameError("CONFIG file already exists %s. Choose a different trial number." % config_file)
        pickle.dump(config, open(os.path.join(results_dir, config_file), "wb"))




        ###############.  10 runs for each trail   ###############

        print("config method = ", config['METHOD'])
        print("config env = ", config['ENV'])

        for run_seed in range(config["NUM_REPETITIONS"]):
            print("Run %s out of %s" % (run_seed + 1, config["NUM_REPETITIONS"]))
            student = make_student(run_seed, config)
            student.train(num_updates=config["NUM_STEPS_TRAIN"])

            env_wrapper_out_of_sample = EnvWrapper(
                env=gym.make(config["ENV"]), mult_factor=get_test_mult_factors(config['NOISE_DIM'] - 1), idx=3, seed=1
            )

            env_wrapper_out_of_sample.noise = 0

            action_match, return_mean, return_std = student.test(
                num_episodes=config["NUM_TRAJS_VALID"], env_wrapper=env_wrapper_out_of_sample
            )

            result = (action_match, return_mean, return_std)
            print("###############    Reward for test environment for run %s: %s.   ###############\n\n" % (run_seed + 1, return_mean))
            save_results(results_file_path, run_seed, action_match, return_mean, return_std)

        results_trial = pd.read_csv(
            "testing/results/"
            + config["ENV"]
            + "/"
            + str(config["NUM_TRAJS_GIVEN"])
            + "/"
            + config["ALG"]
            + "/trial_"
            + str(TRIAL)
            + "_results.csv",
            header=None,
        )

        print("Average reward for 10 repetitions: %s" % np.mean(results_trial[2].values))

        all_results_trail.append(np.mean(results_trial[2].values))
    print("ALL RESULTS TRAIL:" , all_results_trail)




Config: {'ENV': 'Acrobot-v1', 'ALG': 'FINAL_BCIRMStudent_replicatedata_trajnum1', 'NUM_TRAJS_GIVEN': 1, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 1, 'SAMPLING_RATE': 5, 'NUM_STEPS_TRAIN': 10000, 'NUM_TRAJS_VALID': 100, 'NUM_REPETITIONS': 15, 'BATCH_SIZE': 64, 'MLP_WIDTHS': 64, 'ADAM_ALPHA': 0.001, 'SGLD_BUFFER_SIZE': 10000, 'SGLD_LEARN_RATE': 0.01, 'SGLD_NOISE_COEF': 0.01, 'SGLD_NUM_STEPS': 100, 'SGLD_REINIT_FREQ': 0.05, 'NUM_STEPS_TRAIN_ENERGY_MODEL': 1000, 'TRIAL': 0, 'METHOD': 'BCIRM', 'l2_regularizer_weight': 0.001, 'penalty_weight': 10000, 'penalty_anneal_iters': 5000, 'EXPERT_ALG': 'dqn'}
Trial number 0
config method =  BCIRM
config env =  Acrobot-v1
Run 1 out of 15




state_dim 10
200 tensor(0.0741, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0458, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0321, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0254, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0221, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0204, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0195, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0190, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0187, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0186, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0184, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0184, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0183, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0182, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0182, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0181, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0730, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0452, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0318, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0251, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0218, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0202, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0193, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0188, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0186, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0184, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0183, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0182, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0182, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0181, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0181, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0181, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0821, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0538, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0395, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0323, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0285, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0263, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0250, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0242, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0235, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0231, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0228, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0226, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0224, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0223, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0222, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0221, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0750, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0466, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0323, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0252, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0217, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0199, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0190, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0184, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0181, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0180, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0178, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0177, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0177, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0176, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0175, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0175, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0748, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0457, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0320, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0255, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0222, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0206, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0197, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0192, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0189, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0188, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0186, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0185, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0185, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0184, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0183, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0183, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0935, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0630, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0479, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0406, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0369, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0351, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0341, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0335, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0331, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0328, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0325, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0324, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0322, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0321, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0320, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0319, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0747, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0470, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0337, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0271, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0238, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0220, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0211, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0205, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0202, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0200, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0199, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0198, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0197, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0196, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0195, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0195, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0847, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0551, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0410, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0342, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0307, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0289, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0278, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0272, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0268, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0265, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0262, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0260, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0259, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0258, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0257, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0257, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0807, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0529, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0391, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0322, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0284, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0263, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0250, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0242, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0238, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0235, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0232, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0231, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0230, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0229, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0228, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0227, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0777, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0488, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0340, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0264, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0225, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0205, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0194, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0188, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0185, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0183, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0182, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0182, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0181, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0181, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0180, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0180, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0848, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0549, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0397, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0321, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0282, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0261, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0249, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0242, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0237, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0233, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0230, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0227, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0225, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0224, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0223, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0222, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0741, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0459, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0323, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0256, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0222, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0205, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0197, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0192, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0189, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0187, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0186, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0185, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0184, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0183, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0183, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0183, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0717, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0440, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0313, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0254, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0225, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0211, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0204, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0200, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0198, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0196, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0195, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0194, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0194, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0193, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0192, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0192, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0795, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0510, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0372, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0304, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0270, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0251, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0241, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0234, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0230, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0228, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0227, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0226, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0225, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0224, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0224, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0223, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0783, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0505, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0366, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0296, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0260, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0240, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0229, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0222, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0219, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0216, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0214, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0213, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0212, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0211, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0210, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0209, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0862, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0564, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0418, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0344, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0306, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0285, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0272, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0265, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0260, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0257, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0255, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0254, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0253, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0252, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0251, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0251, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.1240, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0722, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0571, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0498, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0460, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0439, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0428, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0420, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0420, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0414, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0412, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0411, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0409, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0408, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0407, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0405, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0820, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0543, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0414, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0352, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0320, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0303, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0293, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0288, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0285, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0283, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0282, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0281, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0280, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0279, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0279, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0278, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.1229, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0937, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0774, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0577, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0530, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0503, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0485, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0470, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0457, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0446, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0437, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0430, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0425, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0420, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0417, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0415, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0842, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0559, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0416, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0344, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0307, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0287, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0275, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0267, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0262, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0259, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0257, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0255, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0254, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0254, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0253, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0253, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.1277, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0726, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0828, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0530, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0500, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0480, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0466, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0455, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0446, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0439, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0433, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0427, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0422, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0418, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0413, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0409, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0984, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0673, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0533, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0466, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0433, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0415, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0405, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0399, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0395, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0392, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0390, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0389, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0388, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0387, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0386, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0385, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.0938, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0650, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0504, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0429, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0389, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0366, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0353, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0344, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0339, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0335, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0332, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0330, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0328, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0327, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0326, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0325, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



state_dim 10
200 tensor(0.1843, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.0998, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.0727, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.0639, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.0597, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.0576, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.0564, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.0557, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.0553, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.0549, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.0546, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.0543, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.0541, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.0539, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.0538, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.0537, device='cuda:0', grad_fn=<AddBackward0>)
3400 tensor(0.0



[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
8400 tensor(3.0912e-05, device='cuda:0', grad_fn=<DivBackward0>)
8600 tensor(5.7856e-05, device='cuda:0', grad_fn=<DivBackward0>)
8800 tensor(2.1144e-05, device='cuda:0', grad_fn=<DivBackward0>)
9000 tensor(1.6904e-05, device='cuda:0', grad_fn=<DivBackward0>)
9200 tensor(0.0002, device='cuda:0', grad_fn=<DivBackward0>)
9400 tensor(0.0004, device='cuda:0', grad_fn=<DivBackward0>)
9600 tensor(0.0005, device='cuda:0', grad_fn=<DivBackward0>)
9800 tensor(0.0007, device='cuda:0', grad_fn=<DivBackward0>)
10000 tensor(0.0004, device='cuda:0', grad_fn=<DivBackward0>)
###############    Reward for test environment for run 15: -86.52.   ###############


Average reward for 10 repetitions: -84.42999999999999
ALL RESULTS TRAIL: [-84.42999999999999]
Config: {'ENV': 'Acrobot-v1', 'ALG': 'FINAL_BCIRMStudent_replicatedata_trajnum6', 'NUM_TRAJS_GIVEN': 6, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 6, 'SAMPLING_RATE': 5, 'NUM_STEPS_TRAI

#10 Trails -- BC

In [8]:
config['METHOD'] = "BC"

for traj_num in [1,2,3,4,5,6,7,8,9,10, 20]:
    config["NUM_TRAJS_GIVEN"] = traj_num
    config["TRAJ_SHIFT"] = traj_num

    config['ALG'] = "FINAL_BCStudent_replicatedata_trajnum" + str(traj_num)



    ###############.  settings   ###############
    #config['ALG'] = "BCStudent_Apr19_replicatedata"
    #config['METHOD'] = "BC"
    #config['ENV'] == "CartPole-v1"
    #config['ENV'] == "LunarLander-v2"
    #config["NUM_TRAJS_GIVEN"] = 20
    #config["TRAJ_SHIFT"] = 20
    ###############.  settings   ###############



    if config['METHOD'] == 'BCIRM':
        config['l2_regularizer_weight'] = 0.001
        config['penalty_weight'] = 10000
        config['penalty_anneal_iters'] = 5000

    all_results_trail = []

    for trail in range(1): 
        config['TRIAL'] = trail 


        ###############.  start a trail   ###############

        config["EXPERT_ALG"] = yaml.load(open("testing/config.yml"), Loader=yaml.FullLoader)[config["ENV"]]
        print("Config: %s" % config)

        TRIAL = config["TRIAL"] #args.trial
        print("Trial number %s" % TRIAL)

        results_dir_base = "testing/results/"
        results_dir = os.path.join(results_dir_base, config["ENV"], str(config["NUM_TRAJS_GIVEN"]), config["ALG"])

        if not os.path.exists(results_dir):
            os.makedirs(results_dir)

        config_file = "trial_" + str(TRIAL) + "_" + "config.pkl"

        results_file_name = "trial_" + str(TRIAL) + "_" + "results.csv"
        results_file_path = os.path.join(results_dir, results_file_name)

        if os.path.exists(os.path.join(results_dir, config_file)):
            raise NameError("CONFIG file already exists %s. Choose a different trial number." % config_file)
        pickle.dump(config, open(os.path.join(results_dir, config_file), "wb"))




        ###############.  10 runs for each trail   ###############

        print("config method = ", config['METHOD'])
        print("config env = ", config['ENV'])

        for run_seed in range(config["NUM_REPETITIONS"]):
            print("Run %s out of %s" % (run_seed + 1, config["NUM_REPETITIONS"]))
            student = make_student(run_seed, config)
            student.train(num_updates=config["NUM_STEPS_TRAIN"])

            env_wrapper_out_of_sample = EnvWrapper(
                env=gym.make(config["ENV"]), mult_factor=get_test_mult_factors(config['NOISE_DIM'] - 1), idx=3, seed=1
            )
            action_match, return_mean, return_std = student.test(
                num_episodes=config["NUM_TRAJS_VALID"], env_wrapper=env_wrapper_out_of_sample
            )

            result = (action_match, return_mean, return_std)
            print("###############    Reward for test environment for run %s: %s.   ###############\n\n" % (run_seed + 1, return_mean))
            save_results(results_file_path, run_seed, action_match, return_mean, return_std)

        results_trial = pd.read_csv(
            "testing/results/"
            + config["ENV"]
            + "/"
            + str(config["NUM_TRAJS_GIVEN"])
            + "/"
            + config["ALG"]
            + "/trial_"
            + str(TRIAL)
            + "_results.csv",
            header=None,
        )

        print("Average reward for 10 repetitions: %s" % np.mean(results_trial[2].values))

        all_results_trail.append(np.mean(results_trial[2].values))

    print("ALL RESULTS TRAIL:" , all_results_trail)

Config: {'ENV': 'Acrobot-v1', 'ALG': 'FINAL_BCStudent_replicatedata_trajnum1', 'NUM_TRAJS_GIVEN': 1, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 1, 'SAMPLING_RATE': 5, 'NUM_STEPS_TRAIN': 10000, 'NUM_TRAJS_VALID': 100, 'NUM_REPETITIONS': 15, 'BATCH_SIZE': 64, 'MLP_WIDTHS': 64, 'ADAM_ALPHA': 0.001, 'SGLD_BUFFER_SIZE': 10000, 'SGLD_LEARN_RATE': 0.01, 'SGLD_NOISE_COEF': 0.01, 'SGLD_NUM_STEPS': 100, 'SGLD_REINIT_FREQ': 0.05, 'NUM_STEPS_TRAIN_ENERGY_MODEL': 1000, 'TRIAL': 0, 'METHOD': 'BC', 'l2_regularizer_weight': 0.001, 'penalty_weight': 10000, 'penalty_anneal_iters': 5000, 'EXPERT_ALG': 'dqn'}
Trial number 0
config method =  BC
config env =  Acrobot-v1
Run 1 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 1: -118.52.   ###############


Run 2 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 2: -100.25.   ###############


Run 3 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 3: -102.23.   ###############


Run 4 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 4: -375.57.   ###############


Run 5 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 5: -130.75.   ###############


Run 6 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 6: -244.06.   ###############


Run 7 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 7: -82.69.   ###############


Run 8 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 8: -117.81.   ###############


Run 9 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 9: -500.0.   ###############


Run 10 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 10: -119.62.   ###############


Run 11 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 11: -90.9.   ###############


Run 12 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 12: -168.51.   ###############


Run 13 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 13: -281.65.   ###############


Run 14 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 14: -115.94.   ###############


Run 15 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 15: -302.15.   ###############


Average reward for 10 repetitions: -190.04333333333335
ALL RESULTS TRAIL: [-190.04333333333335]
Config: {'ENV': 'Acrobot-v1', 'ALG': 'FINAL_BCStudent_replicatedata_trajnum2', 'NUM_TRAJS_GIVEN': 2, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 2, 'SAMPLING_RATE': 5, 'NUM_STEPS_TRAIN': 10000, 'NUM_TRAJS_VALID': 100, 'NUM_REPETITIONS': 15, 'BATCH_SIZE': 64, 'MLP_WIDTHS': 64, 'ADAM_ALPHA': 0.001, 'SGLD_BUFFER_SIZE': 10000, 'SGLD_LEARN_RATE': 0.01, 'SGLD_NOISE_COEF': 0.01, 'SGLD_NUM_STEPS': 100, 'SGLD_REINIT_FREQ': 0.05, 'NUM_STEPS_TRAIN_ENERGY_MODEL': 1000, 'TRIAL': 0, 'METHOD': 'BC', 'l2_regularizer_weight': 0.001, 'penalty_weight': 10000, 'penalty_anneal_iters': 5000, 'EXPERT_ALG': 'dqn'}
Trial number 0
config method =  BC
config env =  Acrobot-v1
Run 1 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 1: -88.08.   ###############


Run 2 out of 15
state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 2: -85.21.   ###############


Run 3 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 3: -85.75.   ###############


Run 4 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 4: -86.67.   ###############


Run 5 out of 15
state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 5: -85.09.   ###############


Run 6 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 6: -84.42.   ###############


Run 7 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 7: -93.94.   ###############


Run 8 out of 15
state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 8: -85.38.   ###############


Run 9 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 9: -112.71.   ###############


Run 10 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 10: -82.59.   ###############


Run 11 out of 15
state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 11: -85.82.   ###############


Run 12 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 12: -88.11.   ###############


Run 13 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 13: -139.93.   ###############


Run 14 out of 15
state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 14: -81.23.   ###############


Run 15 out of 15




state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 15: -85.72.   ###############


Average reward for 10 repetitions: -91.37666666666668
ALL RESULTS TRAIL: [-91.37666666666668]
Config: {'ENV': 'Acrobot-v1', 'ALG': 'FINAL_BCStudent_replicatedata_trajnum3', 'NUM_TRAJS_GIVEN': 3, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 3, 'SAMPLING_RATE': 5, 'NUM_STEPS_TRAIN': 10000, 'NUM_TRAJS_VALID': 100, 'NUM_REPETITIONS': 15, 'BATCH_SIZE': 64, 'MLP_WIDTHS': 64, 'ADAM_ALPHA': 0.001, 'SGLD_BUFFER_SIZE': 10000, 'SGLD_LEARN_RATE': 0.01, 'SGLD_NOISE_COEF': 0.01, 'SGLD_NUM_STEPS': 100, 'SGLD_REINIT_FREQ': 0.05, 'NUM_STEPS_TRAIN_ENERGY_MODEL': 1000, 'TRIAL': 0, 'METHOD': 'BC', 'l2_regularizer_weight': 0.001, 'penalty_weight': 10000, 'penalty_anneal_iters': 5000, 'EXPERT_ALG': 'dqn'}
Trial number 0
config method =  BC
config env =  Acrobot-v1
Run 1 out of 15
state_dim 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000

#10 Trails -- ICIL

In [6]:
config['METHOD'] = "ICIL"

for traj_num in [8,9,10, 20]:
    config["NUM_TRAJS_GIVEN"] = traj_num
    config["TRAJ_SHIFT"] = traj_num

    config['ALG'] = "FINAL_ICILStudent_replicatedata_trajnum" + str(traj_num)



    ###############.  settings   ###############
    #config['ALG'] = "ICILStudent_Apr19_replicatedata"
    #config['METHOD'] = "ICIL"
    #config['ENV'] == "CartPole-v1"
    #config['ENV'] == "LunarLander-v2"

    ###############.  settings   ###############



    if config['METHOD'] == 'BCIRM':
        config['l2_regularizer_weight'] = 0.001
        config['penalty_weight'] = 10000
        config['penalty_anneal_iters'] = 100

    all_results_trail = []

    for trail in range(1): 
        config['TRIAL'] = trail 


        ###############.  start a trail   ###############

        config["EXPERT_ALG"] = yaml.load(open("testing/config.yml"), Loader=yaml.FullLoader)[config["ENV"]]
        print("Config: %s" % config)

        TRIAL = config["TRIAL"] #args.trial
        print("Trial number %s" % TRIAL)

        results_dir_base = "testing/results/"
        results_dir = os.path.join(results_dir_base, config["ENV"], str(config["NUM_TRAJS_GIVEN"]), config["ALG"])

        if not os.path.exists(results_dir):
            os.makedirs(results_dir)

        config_file = "trial_" + str(TRIAL) + "_" + "config.pkl"

        results_file_name = "trial_" + str(TRIAL) + "_" + "results.csv"
        results_file_path = os.path.join(results_dir, results_file_name)

        if os.path.exists(os.path.join(results_dir, config_file)):
            raise NameError("CONFIG file already exists %s. Choose a different trial number." % config_file)
        pickle.dump(config, open(os.path.join(results_dir, config_file), "wb"))




        ###############.  10 runs for each trail   ###############

        print("config method = ", config['METHOD'])
        print("config env = ", config['ENV'])

        for run_seed in range(config["NUM_REPETITIONS"]):
            print("Run %s out of %s" % (run_seed + 1, config["NUM_REPETITIONS"]))
            student = make_student(run_seed, config)
            student.train(num_updates=config["NUM_STEPS_TRAIN"])

            env_wrapper_out_of_sample = EnvWrapper(
                env=gym.make(config["ENV"]), mult_factor=get_test_mult_factors(config['NOISE_DIM'] - 1), idx=3, seed=1
            )
            action_match, return_mean, return_std = student.test(
                num_episodes=config["NUM_TRAJS_VALID"], env_wrapper=env_wrapper_out_of_sample
            )

            result = (action_match, return_mean, return_std)
            print("###############    Reward for test environment for run %s: %s.   ###############\n\n" % (run_seed + 1, return_mean))
            save_results(results_file_path, run_seed, action_match, return_mean, return_std)

        results_trial = pd.read_csv(
            "testing/results/"
            + config["ENV"]
            + "/"
            + str(config["NUM_TRAJS_GIVEN"])
            + "/"
            + config["ALG"]
            + "/trial_"
            + str(TRIAL)
            + "_results.csv",
            header=None,
        )

        print("Average reward for 10 repetitions: %s" % np.mean(results_trial[2].values))

        all_results_trail.append(np.mean(results_trial[2].values))

    print("ALL RESULTS TRAIL:" , all_results_trail)

Config: {'ENV': 'Acrobot-v1', 'ALG': 'FINAL_ICILStudent_replicatedata_trajnum8', 'NUM_TRAJS_GIVEN': 8, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 8, 'SAMPLING_RATE': 5, 'NUM_STEPS_TRAIN': 10000, 'NUM_TRAJS_VALID': 100, 'NUM_REPETITIONS': 15, 'BATCH_SIZE': 64, 'MLP_WIDTHS': 64, 'ADAM_ALPHA': 0.001, 'SGLD_BUFFER_SIZE': 10000, 'SGLD_LEARN_RATE': 0.01, 'SGLD_NOISE_COEF': 0.01, 'SGLD_NUM_STEPS': 100, 'SGLD_REINIT_FREQ': 0.05, 'NUM_STEPS_TRAIN_ENERGY_MODEL': 1000, 'TRIAL': 0, 'METHOD': 'ICIL', 'EXPERT_ALG': 'dqn'}
Trial number 0
config method =  ICIL
config env =  Acrobot-v1
Run 1 out of 15
state_dim 10


100%|██████████| 1000/1000 [01:14<00:00, 13.48it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 1: -86.79.   ###############


Run 2 out of 15


  0%|          | 2/1000 [00:00<01:11, 13.86it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.15it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 2: -83.63.   ###############


Run 3 out of 15


  0%|          | 2/1000 [00:00<01:13, 13.60it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.63it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 3: -83.6.   ###############


Run 4 out of 15


  0%|          | 2/1000 [00:00<01:13, 13.62it/s]

state_dim 10


100%|██████████| 1000/1000 [01:14<00:00, 13.35it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 4: -82.5.   ###############


Run 5 out of 15


  0%|          | 2/1000 [00:00<01:15, 13.18it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 12.59it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 5: -83.48.   ###############


Run 6 out of 15


  0%|          | 2/1000 [00:00<01:16, 13.12it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.20it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 6: -83.04.   ###############


Run 7 out of 15


  0%|          | 2/1000 [00:00<01:15, 13.27it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.29it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 7: -82.98.   ###############


Run 8 out of 15


  0%|          | 2/1000 [00:00<01:13, 13.56it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 12.94it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 8: -81.38.   ###############


Run 9 out of 15


  0%|          | 2/1000 [00:00<01:13, 13.51it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.40it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 9: -81.1.   ###############


Run 10 out of 15


  0%|          | 2/1000 [00:00<01:19, 12.53it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 12.76it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 10: -85.2.   ###############


Run 11 out of 15


  0%|          | 2/1000 [00:00<01:17, 12.81it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.02it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 11: -84.6.   ###############


Run 12 out of 15


  0%|          | 2/1000 [00:00<01:11, 13.90it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.37it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 12: -84.61.   ###############


Run 13 out of 15


  0%|          | 2/1000 [00:00<01:15, 13.17it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.24it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 13: -83.11.   ###############


Run 14 out of 15


  0%|          | 2/1000 [00:00<01:12, 13.76it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.18it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 14: -86.18.   ###############


Run 15 out of 15


  0%|          | 2/1000 [00:00<01:16, 13.10it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 12.56it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 15: -82.68.   ###############


Average reward for 10 repetitions: -83.65866666666668
ALL RESULTS TRAIL: [-83.65866666666668]
Config: {'ENV': 'Acrobot-v1', 'ALG': 'FINAL_ICILStudent_replicatedata_trajnum9', 'NUM_TRAJS_GIVEN': 9, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 9, 'SAMPLING_RATE

  0%|          | 2/1000 [00:00<01:14, 13.44it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.09it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 1: -81.63.   ###############


Run 2 out of 15


  0%|          | 2/1000 [00:00<01:15, 13.21it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.12it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 2: -86.62.   ###############


Run 3 out of 15


  0%|          | 2/1000 [00:00<01:19, 12.48it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.30it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 3: -85.37.   ###############


Run 4 out of 15


  0%|          | 2/1000 [00:00<01:19, 12.53it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.27it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 4: -80.43.   ###############


Run 5 out of 15


  0%|          | 2/1000 [00:00<01:11, 13.95it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.23it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 5: -83.27.   ###############


Run 6 out of 15


  0%|          | 2/1000 [00:00<01:11, 13.94it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 12.77it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 6: -87.53.   ###############


Run 7 out of 15


  0%|          | 2/1000 [00:00<01:15, 13.14it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.15it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 7: -85.77.   ###############


Run 8 out of 15


  0%|          | 2/1000 [00:00<01:14, 13.40it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.26it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 8: -82.9.   ###############


Run 9 out of 15


  0%|          | 2/1000 [00:00<01:14, 13.34it/s]

state_dim 10


100%|██████████| 1000/1000 [01:17<00:00, 13.20it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 9: -84.31.   ###############


Run 10 out of 15


  0%|          | 2/1000 [00:00<01:17, 12.80it/s]

state_dim 10


100%|██████████| 1000/1000 [01:17<00:00, 13.11it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 10: -81.89.   ###############


Run 11 out of 15


  0%|          | 2/1000 [00:00<01:16, 13.05it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.05it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 11: -81.3.   ###############


Run 12 out of 15


  0%|          | 2/1000 [00:00<01:22, 12.16it/s]

state_dim 10


100%|██████████| 1000/1000 [01:17<00:00, 12.59it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 12: -85.05.   ###############


Run 13 out of 15


  0%|          | 2/1000 [00:00<01:12, 13.68it/s]

state_dim 10


100%|██████████| 1000/1000 [01:17<00:00, 13.17it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 13: -83.65.   ###############


Run 14 out of 15


  0%|          | 2/1000 [00:00<01:14, 13.45it/s]

state_dim 10


100%|██████████| 1000/1000 [01:17<00:00, 12.67it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 14: -83.07.   ###############


Run 15 out of 15


  0%|          | 2/1000 [00:00<01:19, 12.49it/s]

state_dim 10


100%|██████████| 1000/1000 [01:17<00:00, 12.76it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 15: -82.74.   ###############


Average reward for 10 repetitions: -83.70199999999998
ALL RESULTS TRAIL: [-83.70199999999998]
Config: {'ENV': 'Acrobot-v1', 'ALG': 'FINAL_ICILStudent_replicatedata_trajnum10', 'NUM_TRAJS_GIVEN': 10, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 10, 'SAMPLING_R

  0%|          | 2/1000 [00:00<01:25, 11.65it/s]

state_dim 10


100%|██████████| 1000/1000 [01:17<00:00, 12.48it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 1: -85.49.   ###############


Run 2 out of 15


  0%|          | 2/1000 [00:00<01:19, 12.59it/s]

state_dim 10


100%|██████████| 1000/1000 [01:18<00:00, 12.84it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 2: -83.95.   ###############


Run 3 out of 15


  0%|          | 2/1000 [00:00<01:23, 11.98it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.02it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 3: -81.12.   ###############


Run 4 out of 15


  0%|          | 2/1000 [00:00<01:16, 13.08it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.71it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 4: -85.39.   ###############


Run 5 out of 15


  0%|          | 2/1000 [00:00<01:14, 13.48it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.27it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 5: -82.88.   ###############


Run 6 out of 15


  0%|          | 2/1000 [00:00<01:15, 13.13it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.46it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 6: -84.18.   ###############


Run 7 out of 15


  0%|          | 2/1000 [00:00<01:16, 13.10it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.04it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 7: -81.4.   ###############


Run 8 out of 15


  0%|          | 2/1000 [00:00<01:14, 13.33it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.33it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 8: -84.31.   ###############


Run 9 out of 15


  0%|          | 2/1000 [00:00<01:19, 12.61it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 12.71it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 9: -87.36.   ###############


Run 10 out of 15


  0%|          | 2/1000 [00:00<01:16, 13.04it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 12.70it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 10: -85.25.   ###############


Run 11 out of 15


  0%|          | 2/1000 [00:00<01:13, 13.58it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.46it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 11: -84.03.   ###############


Run 12 out of 15


  0%|          | 2/1000 [00:00<01:18, 12.64it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 12.44it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 12: -81.33.   ###############


Run 13 out of 15


  0%|          | 2/1000 [00:00<01:13, 13.64it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.27it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 13: -83.82.   ###############


Run 14 out of 15


  0%|          | 2/1000 [00:00<01:15, 13.19it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.38it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 14: -85.83.   ###############


Run 15 out of 15


  0%|          | 2/1000 [00:00<01:17, 12.81it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.23it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 15: -83.74.   ###############


Average reward for 10 repetitions: -84.00533333333333
ALL RESULTS TRAIL: [-84.00533333333333]
Config: {'ENV': 'Acrobot-v1', 'ALG': 'FINAL_ICILStudent_replicatedata_trajnum20', 'NUM_TRAJS_GIVEN': 20, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 20, 'SAMPLING_R

  0%|          | 2/1000 [00:00<01:16, 13.05it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.49it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 1: -83.51.   ###############


Run 2 out of 15


  0%|          | 2/1000 [00:00<01:12, 13.69it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.31it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 2: -82.1.   ###############


Run 3 out of 15


  0%|          | 2/1000 [00:00<01:13, 13.56it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 12.76it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 3: -84.21.   ###############


Run 4 out of 15


  0%|          | 2/1000 [00:00<01:13, 13.63it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.45it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 4: -88.96.   ###############


Run 5 out of 15


  0%|          | 2/1000 [00:00<01:14, 13.43it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.20it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 5: -82.1.   ###############


Run 6 out of 15


  0%|          | 2/1000 [00:00<01:15, 13.28it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 12.86it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 6: -81.32.   ###############


Run 7 out of 15


  0%|          | 2/1000 [00:00<01:14, 13.41it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.33it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 7: -83.27.   ###############


Run 8 out of 15


  0%|          | 2/1000 [00:00<01:15, 13.14it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 13.07it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 8: -84.23.   ###############


Run 9 out of 15


  0%|          | 2/1000 [00:00<01:19, 12.60it/s]

state_dim 10


100%|██████████| 1000/1000 [01:15<00:00, 12.79it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 9: -85.04.   ###############


Run 10 out of 15


  0%|          | 2/1000 [00:00<01:15, 13.29it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.05it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 10: -85.65.   ###############


Run 11 out of 15


  0%|          | 2/1000 [00:00<01:20, 12.35it/s]

state_dim 10


100%|██████████| 1000/1000 [01:17<00:00, 13.26it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 11: -85.51.   ###############


Run 12 out of 15


  0%|          | 2/1000 [00:00<01:21, 12.29it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 12.54it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 12: -82.36.   ###############


Run 13 out of 15


  0%|          | 2/1000 [00:00<01:15, 13.15it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.56it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 13: -84.28.   ###############


Run 14 out of 15


  0%|          | 2/1000 [00:00<01:12, 13.76it/s]

state_dim 10


100%|██████████| 1000/1000 [01:16<00:00, 13.54it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 14: -88.52.   ###############


Run 15 out of 15


  0%|          | 2/1000 [00:00<01:17, 12.95it/s]

state_dim 10


100%|██████████| 1000/1000 [01:17<00:00, 12.42it/s]


self.noise_features_encoders [FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
), FeaturesEncoder(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
)]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 15: -87.96.   ###############


Average reward for 10 repetitions: -84.60133333333333
ALL RESULTS TRAIL: [-84.60133333333333]


#line by line

In [None]:

config["EXPERT_ALG"] = yaml.load(open("testing/config.yml"), Loader=yaml.FullLoader)[config["ENV"]]
print("Config: %s" % config)


TRIAL = config["TRIAL"] #args.trial
print("Trial number %s" % TRIAL)


results_dir_base = "testing/results/"
results_dir = os.path.join(results_dir_base, config["ENV"], str(config["NUM_TRAJS_GIVEN"]), config["ALG"])


if not os.path.exists(results_dir):
    os.makedirs(results_dir)



config_file = "trial_" + str(TRIAL) + "_" + "config.pkl"

results_file_name = "trial_" + str(TRIAL) + "_" + "results.csv"
results_file_path = os.path.join(results_dir, results_file_name)

if os.path.exists(os.path.join(results_dir, config_file)):
    raise NameError("CONFIG file already exists %s. Choose a different trial number." % config_file)
pickle.dump(config, open(os.path.join(results_dir, config_file), "wb"))


Config: {'ENV': 'CartPole-v1', 'ALG': 'ICILStudent', 'NUM_TRAJS_GIVEN': 10, 'NUM_TRAINING_ENVS': 2, 'NOISE_DIM': 4, 'REP_SIZE': 16, 'TRAJ_SHIFT': 20, 'SAMPLING_RATE': 5, 'NUM_STEPS_TRAIN': 10000, 'NUM_TRAJS_VALID': 100, 'NUM_REPETITIONS': 10, 'BATCH_SIZE': 64, 'MLP_WIDTHS': 64, 'ADAM_ALPHA': 0.001, 'SGLD_BUFFER_SIZE': 10000, 'SGLD_LEARN_RATE': 0.01, 'SGLD_NOISE_COEF': 0.01, 'SGLD_NUM_STEPS': 100, 'SGLD_REINIT_FREQ': 0.05, 'NUM_STEPS_TRAIN_ENERGY_MODEL': 1000, 'TRIAL': 8, 'METHOD': 'BC', 'EXPERT_ALG': 'dqn'}
Trial number 8


In [None]:

#"""
if __name__ == "__main__":
   
    print("config method = ", config['METHOD'])
    print("config env = ", config['ENV'])

    for run_seed in range(config["NUM_REPETITIONS"]):
        print("Run %s out of %s" % (run_seed + 1, config["NUM_REPETITIONS"]))
        student = make_student(run_seed, config)
        student.train(num_updates=config["NUM_STEPS_TRAIN"])

        env_wrapper_out_of_sample = EnvWrapper(
            env=gym.make(config["ENV"]), mult_factor=get_test_mult_factors(config['NOISE_DIM'] - 1), idx=3, seed=1
        )
        action_match, return_mean, return_std = student.test(
            num_episodes=config["NUM_TRAJS_VALID"], env_wrapper=env_wrapper_out_of_sample
        )

        result = (action_match, return_mean, return_std)
        print("###############    Reward for test environment for run %s: %s.   ###############\n\n" % (run_seed + 1, return_mean))
        save_results(results_file_path, run_seed, action_match, return_mean, return_std)

    results_trial = pd.read_csv(
        "testing/results/"
        + config["ENV"]
        + "/"
        + str(config["NUM_TRAJS_GIVEN"])
        + "/"
        + config["ALG"]
        + "/trial_"
        + str(TRIAL)
        + "_results.csv",
        header=None,
    )

    print("Average reward for 10 repetitions: %s" % np.mean(results_trial[2].values))
#"""

config method =  BC
config env =  CartPole-v1
Run 1 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 1: 166.1.   ###############


Run 2 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 2: 155.74.   ###############


Run 3 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 3: 496.46.   ###############


Run 4 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 4: 500.0.   ###############


Run 5 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 5: 500.0.   ###############


Run 6 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
###############    Reward for test environment for run 6: 274.88.   ###############


Run 7 out of 10
0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [None]:

#"""
if __name__ == "__main__":
   
    print("config method = ", config['METHOD'])
    print("config env = ", config['ENV'])

    for run_seed in range(config["NUM_REPETITIONS"]):
        print("Run %s out of %s" % (run_seed + 1, config["NUM_REPETITIONS"]))
        student = make_student(run_seed, config)
        student.train(num_updates=config["NUM_STEPS_TRAIN"])

        env_wrapper_out_of_sample = EnvWrapper(
            env=gym.make(config["ENV"]), mult_factor=get_test_mult_factors(config['NOISE_DIM'] - 1), idx=3, seed=1
        )
        action_match, return_mean, return_std = student.test(
            num_episodes=config["NUM_TRAJS_VALID"], env_wrapper=env_wrapper_out_of_sample
        )

        result = (action_match, return_mean, return_std)
        print("###############    Reward for test environment for run %s: %s.   ###############\n\n" % (run_seed + 1, return_mean))
        save_results(results_file_path, run_seed, action_match, return_mean, return_std)

    results_trial = pd.read_csv(
        "testing/results/"
        + config["ENV"]
        + "/"
        + str(config["NUM_TRAJS_GIVEN"])
        + "/"
        + config["ALG"]
        + "/trial_"
        + str(TRIAL)
        + "_results.csv",
        header=None,
    )

    print("Average reward for 10 repetitions: %s" % np.mean(results_trial[2].values))
#"""

config method =  BCIRM
config env =  CartPole-v1
Run 1 out of 10
200 tensor(0.4755, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(0.4851, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(0.4670, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(0.4648, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(0.4254, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(0.5061, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(0.5847, device='cuda:0', grad_fn=<AddBackward0>)
1600 tensor(0.4992, device='cuda:0', grad_fn=<AddBackward0>)
1800 tensor(0.3855, device='cuda:0', grad_fn=<AddBackward0>)
2000 tensor(0.4569, device='cuda:0', grad_fn=<AddBackward0>)
2200 tensor(0.5494, device='cuda:0', grad_fn=<AddBackward0>)
2400 tensor(0.3457, device='cuda:0', grad_fn=<AddBackward0>)
2600 tensor(0.4519, device='cuda:0', grad_fn=<AddBackward0>)
2800 tensor(0.3749, device='cuda:0', grad_fn=<AddBackward0>)
3000 tensor(0.5179, device='cuda:0', grad_fn=<AddBackward0>)
3200 tensor(0.6392, devi

# *** *core* *** generate expert_traj_i 

In [None]:



def get_train_spurcorr_expert_trajs(datafile_name, env_name,  _noise = 0.001,  num_envs = 2):

    from tqdm import tqdm 
    import numpy as np

    def generate_spurcorr_obs(observations, _mult_factor, _noise = 0.001, _idx = 0):
        noise_dims = len(_mult_factor)
        #obs_noise = np.zeros_like(observations)
        #obs_noise[-noise_dims:] = np.random.randn(noise_dims) * _noise
        spur_corr = np.matmul(observations[-noise_dims:], _mult_factor)
        #obs = np.concatenate([observations + obs_noise, spur_corr, [_idx]])
        obs = np.concatenate([observations , spur_corr + np.random.randn(noise_dims) * _noise, [_idx]])
        return obs

    raw = np.load(datafile_name, allow_pickle = True)#[()]#["trajs"]

    obs_dim = len(raw['obs'][0])
    obs_num = len(raw['obs'])

    for expert_num in range(num_envs):
        # print(_mult_factor_multipliers[expert_num])
            
        obs_new = np.zeros(shape = (obs_num, obs_dim *2))

        #_mult_factor = np.diag(np.ones(obs_dim-1)) * _mult_factor_multipliers[expert_num]
        if expert_num ==0:
            _mult_factor = np.diag(np.ones(obs_dim-1))
        elif expert_num ==1:
            _mult_factor = np.ones((obs_dim-1, obs_dim-1)) + np.diag(np.ones(obs_dim-1))


        for i in tqdm(range(obs_num)):
            obs_new[i] = generate_spurcorr_obs(raw['obs'][i], _mult_factor, _noise = 0.001, _idx = expert_num)


        start_index = np.where(raw['episode_starts'] == 1)[0]

        data_block = []
        for i in range(len(start_index)-1):
            slice_idx = np.arange(start_index[i], start_index[i+1])

            obs = obs_new[slice_idx]
            actions = raw['actions'][slice_idx]

            data_list = []
            for j in range(len(slice_idx)):
                data_list.append( (obs[j], actions[j][0], expert_num) )

            data_block.append(data_list)

        data_generated = {'trajs': data_block}  


        isExist = os.path.exists("./volume/" + env_name)
        if not isExist:
            # Create a new directory because it does not exist 
            os.makedirs("./volume/" + env_name)
            print("The new directory for {} is created!".format(env_name))


        np.save("./volume/" + env_name + '/expert_trajs_' + str(expert_num) + '.npy', data_generated)
        print("\n{} saved!".format("./volume/" + env_name + '/expert_trajs_' + str(expert_num) + '.npy'))
        
    #return data_generated

In [None]:
np.ones((obs_dim-1, obs_dim-1)) + np.diag(np.ones(obs_dim-1))

array([[2., 1., 1.],
       [1., 2., 1.],
       [1., 1., 2.]])

In [None]:
import os
os.getcwd()

raw = np.load(datafile_name, allow_pickle = True)
obs_dim = len(raw['obs'][0])
obs_num = len(raw['obs'])

(raw['episode_starts'] == 1).sum()

1000

In [None]:
#datafile_name = "expert_lunarlander_timesteps2e5_episodes10.npz"
#datafile_name = 'expert_cartpole_timesteps2e5_episodes10.npz'
#datafile_name = './contrib/expert_replicate/' + 'expert_cartpole_dqn_replicate_episodes1000.npz'
datafile_name = './contrib/expert_replicate/' + 'expert_lunarlander_ppo2_replicate_episodes1000.npz'



env_name = 'LunarLander-v2'
#env_name = 'CartPole-v1'

get_train_spurcorr_expert_trajs(datafile_name, env_name = env_name,  _noise = 0.001, num_envs = 2)



  0%|          | 0/902005 [00:00<?, ?it/s][A
  0%|          | 3/902005 [00:00<11:15:03, 22.27it/s][A
  0%|          | 5/902005 [00:00<11:38:04, 21.54it/s][A
  0%|          | 8/902005 [00:00<11:35:07, 21.63it/s][A
  0%|          | 11/902005 [00:00<11:29:04, 21.82it/s][A
  0%|          | 14/902005 [00:00<11:27:21, 21.87it/s][A
  0%|          | 17/902005 [00:00<11:26:43, 21.89it/s][A
  0%|          | 19/902005 [00:00<11:52:15, 21.11it/s][A
  0%|          | 22/902005 [00:01<11:48:50, 21.21it/s][A
  0%|          | 25/902005 [00:01<11:44:49, 21.33it/s][A
  0%|          | 28/902005 [00:01<11:48:11, 21.23it/s][A
  0%|          | 31/902005 [00:01<11:55:39, 21.01it/s][A
  0%|          | 34/902005 [00:01<11:56:12, 20.99it/s][A
  0%|          | 37/902005 [00:01<11:51:06, 21.14it/s][A
  0%|          | 40/902005 [00:01<11:56:15, 20.99it/s][A
  0%|          | 43/902005 [00:02<11:47:26, 21.25it/s][A
  0%|          | 46/902005 [00:02<11:44:04, 21.35it/s][A
  0%|          | 49/902005 [

KeyboardInterrupt: ignored

In [None]:
data1 = np.load("./volume/CartPole-v1/expert_trajs_1.npy", allow_pickle = True)#[()]#["trajs"]
for key in data1[()]:
  print(key)

trajs
