In [13]:
from lib.algorithms import Q_learn_freetime, Q_learn, freetime_no_reward

In [27]:
import hydra
hydra.initialize(config_path='configs')
cfg = hydra.compose(config_name='config_env3_corners')


ValueError: GlobalHydra is already initialized, call GlobalHydra.instance().clear() if you want to re-initialize

In [48]:
cfg = hydra.compose(config_name='config')
from lib.gym_windy_gridworld import WindyGridworld
env = WindyGridworld(
        height=cfg.env.height, 
        width=cfg.env.width, 
        rewards=list(cfg.env.rewards), 
        wind=cfg.env.wind, 
        start=cfg.env.start, 
        allowed_actions=list(cfg.env.allowed_actions), 
        reward_terminates_episode=cfg.env.reward_terminates_episode
    )
    

In [49]:
cfg

{'initializations': ['random', 'pessimistic', 'optimistic'], 'random_initialization_seed': None, 'num_runs': 5, 'offset': 0.001, 'env': {'height': 20, 'width': 11, 'rewards': [[1, 1, 6], [1, 3, 4]], 'wind': True, 'start': 'random', 'allowed_actions': ['L', 'R', 'C'], 'reward_terminates_episode': False}, 'baseline': {'discount': 0.98, 'alpha': 0.01, 'num_steps': 100000, 'epsilon': 0.05, 'show_rewards': True, 'show_q': True, 'show_trajectory': False}, 'freetime': {'num_steps': 100000, 'epsilon': 0.05, 'discount': 0.98, 'alpha': 0.01, 'alpha_f': 0.01, 'tolerance': 0, 'show_rewards': True, 'show_q': True, 'show_f': False, 'show_f_actions': ['min'], 'show_trajectory': True}, 'trajectory_maps': {'num_plots': 2}, 'q_plots': {'vmin': 0, 'vmax': 2}, 'f_plots': {'vmin': None, 'vmax': None}, 'plot_freetime_vs_baseline_same_table': True}

In [50]:
cfg.freetime.num_steps = 100000
cfg.baseline.num_steps = 100000
cfg

{'initializations': ['random', 'pessimistic', 'optimistic'], 'random_initialization_seed': None, 'num_runs': 5, 'offset': 0.001, 'env': {'height': 20, 'width': 11, 'rewards': [[1, 1, 6], [1, 3, 4]], 'wind': True, 'start': 'random', 'allowed_actions': ['L', 'R', 'C'], 'reward_terminates_episode': False}, 'baseline': {'discount': 0.98, 'alpha': 0.01, 'num_steps': 100000, 'epsilon': 0.05, 'show_rewards': True, 'show_q': True, 'show_trajectory': False}, 'freetime': {'num_steps': 100000, 'epsilon': 0.05, 'discount': 0.98, 'alpha': 0.01, 'alpha_f': 0.01, 'tolerance': 0, 'show_rewards': True, 'show_q': True, 'show_f': False, 'show_f_actions': ['min'], 'show_trajectory': True}, 'trajectory_maps': {'num_plots': 2}, 'q_plots': {'vmin': 0, 'vmax': 2}, 'f_plots': {'vmin': None, 'vmax': None}, 'plot_freetime_vs_baseline_same_table': True}

In [51]:
from lib.algorithms import build_q_table

def grid_search(initialization):
    
    results = {}
    for tolerance in [-0.01, -0.001, 0.0, 0.001, 0.01, 0.1, 0.5]:
        
        results[tolerance] = []
        
        for exp in range(5):
            
            Q = build_q_table(
                (env.height, env.width),                
                env.action_space.n, 
                initialization = initialization,
                seed = cfg.random_initialization_seed # type: ignore
            )
            
            Q, F, rewards, _ = Q_learn_freetime(
                env, 
                Q, 
                cfg.freetime.num_steps, 
                cfg.freetime.epsilon, 
                cfg.freetime.discount, 
                cfg.freetime.alpha, 
                cfg.freetime.alpha_f, 
                tolerance
            )
            
            results[tolerance].append(rewards[-1])
    # Baseline
    results['baseline'] = []
    for exp in range(5):
        
        Q = build_q_table(
                (env.height, env.width),                
                env.action_space.n, 
                initialization = initialization,
                seed = cfg.random_initialization_seed # type: ignore
            )
        
        Q, rewards = Q_learn(
                env, 
                Q, 
                cfg.baseline.num_steps, 
                cfg.baseline.epsilon, 
                cfg.baseline.discount, 
                cfg.baseline.alpha
            )
        
        results['baseline'].append(rewards[-1])

    # Ratio
    results['ratio'] = []
    for exp in range(5):
        
        Q = build_q_table(
                (env.height, env.width),                
                env.action_space.n, 
                initialization = initialization,
                seed = cfg.random_initialization_seed # type: ignore
            )
        
        Q, F, rewards, _ = freetime_no_reward(
                env, 
                Q, 
                cfg.freetime.num_steps, 
                cfg.freetime.epsilon, 
                cfg.freetime.discount, 
                cfg.freetime.alpha, 
                cfg.freetime.alpha_f, 
                cfg.freetime.tolerance
            )
        
        results['ratio'].append(rewards[-1])
        
    results = {k: sum(v)/len(v) for k, v in results.items()} 
    return results


In [52]:
results_rand = grid_search('random')
results_rand

100%|██████████| 100000/100000 [00:10<00:00, 9780.15it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9741.06it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9734.89it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9740.58it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9772.02it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9693.80it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9731.10it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9757.71it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9779.68it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9766.29it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9738.21it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9746.76it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9750.09it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9717.37it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9730.15it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9738.21it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9818.60it/s]
100%|█████████

{-0.01: 5755.8,
 -0.001: 6768.8,
 0.0: 5867.8,
 0.001: 6072.6,
 0.01: 6008.6,
 0.1: 5026.2,
 0.5: 1411.6,
 'baseline': 4662.2,
 'ratio': 5054.6}

In [53]:
results_opt = grid_search('optimistic')
results_opt

100%|██████████| 100000/100000 [00:10<00:00, 9668.01it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9594.21it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9685.82it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9672.69it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9507.47it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9680.19it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9520.16it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9622.39it/s]
100%|██████████| 100000/100000 [00:11<00:00, 8993.16it/s]
100%|██████████| 100000/100000 [00:11<00:00, 8940.46it/s]
100%|██████████| 100000/100000 [00:11<00:00, 8905.79it/s]
100%|██████████| 100000/100000 [00:11<00:00, 8891.52it/s]
100%|██████████| 100000/100000 [00:11<00:00, 9042.00it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9123.14it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9345.19it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9395.27it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9313.82it/s]
100%|█████████

{-0.01: 8805.8,
 -0.001: 8859.8,
 0.0: 8751.0,
 0.001: 8674.2,
 0.01: 8792.2,
 0.1: 4371.8,
 0.5: 1466.2,
 'baseline': 8769.2,
 'ratio': 8753.8}

In [54]:
results_pess = grid_search('pessimistic')
results_pess

100%|██████████| 100000/100000 [00:10<00:00, 9675.03it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9812.81it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9593.28it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9777.76it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9793.57it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9815.70it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9925.89it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9872.93it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9889.05it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9858.80it/s]
100%|██████████| 100000/100000 [00:09<00:00, 10035.06it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9913.09it/s]
100%|██████████| 100000/100000 [00:09<00:00, 10093.86it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9900.81it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9926.88it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9920.97it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9991.90it/s]
100%|███████

{-0.01: 9165.6,
 -0.001: 9201.0,
 0.0: 8881.6,
 0.001: 7806.4,
 0.01: 6556.0,
 0.1: 3995.0,
 0.5: 1418.8,
 'baseline': 7691.8,
 'ratio': 8901.4}

In [55]:
print("Pess Init Env 1: Center Reward")
results_pess

Pess Init Env 1: Center Reward


{-0.01: 9165.6,
 -0.001: 9201.0,
 0.0: 8881.6,
 0.001: 7806.4,
 0.01: 6556.0,
 0.1: 3995.0,
 0.5: 1418.8,
 'baseline': 7691.8,
 'ratio': 8901.4}

In [56]:
print("Opt Init Env 1: Center Reward")
results_opt

Opt Init Env 1: Center Reward


{-0.01: 8805.8,
 -0.001: 8859.8,
 0.0: 8751.0,
 0.001: 8674.2,
 0.01: 8792.2,
 0.1: 4371.8,
 0.5: 1466.2,
 'baseline': 8769.2,
 'ratio': 8753.8}

In [57]:
print("Rand Init Env 1: Center Reward")
results_rand

Rand Init Env 1: Center Reward


{-0.01: 5755.8,
 -0.001: 6768.8,
 0.0: 5867.8,
 0.001: 6072.6,
 0.01: 6008.6,
 0.1: 5026.2,
 0.5: 1411.6,
 'baseline': 4662.2,
 'ratio': 5054.6}