In [13]:
from lib.algorithms import Q_learn_freetime, Q_learn, freetime_no_reward

In [27]:
import hydra
hydra.initialize(config_path='configs')
cfg = hydra.compose(config_name='config_env3_corners')


ValueError: GlobalHydra is already initialized, call GlobalHydra.instance().clear() if you want to re-initialize

In [32]:
cfg = hydra.compose(config_name='config_env1')
from lib.gym_windy_gridworld import WindyGridworld
env = WindyGridworld(
        height=cfg.env.height, 
        width=cfg.env.width, 
        rewards=list(cfg.env.rewards), 
        wind=cfg.env.wind, 
        start=cfg.env.start, 
        allowed_actions=list(cfg.env.allowed_actions), 
        reward_terminates_episode=cfg.env.reward_terminates_episode
    )
    

In [33]:
cfg

{'initializations': ['random', 'pessimistic', 'optimistic'], 'random_initialization_seed': None, 'num_runs': 5, 'offset': 0.001, 'env': {'height': 20, 'width': 11, 'rewards': [[1, 0, 5]], 'wind': True, 'start': 'random', 'allowed_actions': ['L', 'R', 'C'], 'reward_terminates_episode': True}, 'baseline': {'discount': 0.98, 'alpha': 0.01, 'num_steps': 100000, 'epsilon': 0.05, 'show_rewards': True, 'show_q': True, 'show_trajectory': False}, 'freetime': {'num_steps': 100000, 'epsilon': 0.05, 'discount': 0.98, 'alpha': 0.01, 'alpha_f': 0.01, 'tolerance': 0, 'show_rewards': True, 'show_q': True, 'show_f': False, 'show_f_actions': ['min'], 'show_trajectory': False}, 'trajectory_maps': {'num_plots': 2}, 'q_plots': {'vmin': 0, 'vmax': 1}, 'f_plots': {'vmin': None, 'vmax': None}, 'plot_freetime_vs_baseline_same_table': True}

In [34]:
cfg.freetime.num_steps = 100000
cfg.baseline.num_steps = 100000
cfg

{'initializations': ['random', 'pessimistic', 'optimistic'], 'random_initialization_seed': None, 'num_runs': 5, 'offset': 0.001, 'env': {'height': 20, 'width': 11, 'rewards': [[1, 0, 5]], 'wind': True, 'start': 'random', 'allowed_actions': ['L', 'R', 'C'], 'reward_terminates_episode': True}, 'baseline': {'discount': 0.98, 'alpha': 0.01, 'num_steps': 100000, 'epsilon': 0.05, 'show_rewards': True, 'show_q': True, 'show_trajectory': False}, 'freetime': {'num_steps': 100000, 'epsilon': 0.05, 'discount': 0.98, 'alpha': 0.01, 'alpha_f': 0.01, 'tolerance': 0, 'show_rewards': True, 'show_q': True, 'show_f': False, 'show_f_actions': ['min'], 'show_trajectory': False}, 'trajectory_maps': {'num_plots': 2}, 'q_plots': {'vmin': 0, 'vmax': 1}, 'f_plots': {'vmin': None, 'vmax': None}, 'plot_freetime_vs_baseline_same_table': True}

In [35]:
from lib.algorithms import build_q_table

def grid_search(initialization):
    
    results = {}
    for tolerance in [-0.01, -0.001, 0.0, 0.001, 0.01, 0.1, 0.5]:
        
        results[tolerance] = []
        
        for exp in range(5):
            
            Q = build_q_table(
                (env.height, env.width),                
                env.action_space.n, 
                initialization = initialization,
                seed = cfg.random_initialization_seed # type: ignore
            )
            
            Q, F, rewards, _ = Q_learn_freetime(
                env, 
                Q, 
                cfg.freetime.num_steps, 
                cfg.freetime.epsilon, 
                cfg.freetime.discount, 
                cfg.freetime.alpha, 
                cfg.freetime.alpha_f, 
                tolerance
            )
            
            results[tolerance].append(rewards[-1])
    # Baseline
    results['baseline'] = []
    for exp in range(5):
        
        Q = build_q_table(
                (env.height, env.width),                
                env.action_space.n, 
                initialization = initialization,
                seed = cfg.random_initialization_seed # type: ignore
            )
        
        Q, rewards = Q_learn(
                env, 
                Q, 
                cfg.baseline.num_steps, 
                cfg.baseline.epsilon, 
                cfg.baseline.discount, 
                cfg.baseline.alpha
            )
        
        results['baseline'].append(rewards[-1])

    # Ratio
    results['ratio'] = []
    for exp in range(5):
        
        Q = build_q_table(
                (env.height, env.width),                
                env.action_space.n, 
                initialization = initialization,
                seed = cfg.random_initialization_seed # type: ignore
            )
        
        Q, F, rewards, _ = freetime_no_reward(
                env, 
                Q, 
                cfg.freetime.num_steps, 
                cfg.freetime.epsilon, 
                cfg.freetime.discount, 
                cfg.freetime.alpha, 
                cfg.freetime.alpha_f, 
                cfg.freetime.tolerance
            )
        
        results['ratio'].append(rewards[-1])
        
    results = {k: sum(v)/len(v) for k, v in results.items()} 
    return results


In [21]:
results_rand = grid_search('random')
results_rand

100%|██████████| 100000/100000 [00:03<00:00, 27815.71it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27777.05it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27924.56it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27302.65it/s]
100%|██████████| 100000/100000 [00:03<00:00, 25924.77it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27381.22it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28224.31it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28216.34it/s]
100%|██████████| 100000/100000 [00:03<00:00, 25891.18it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21812.92it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21076.69it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23133.61it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22754.28it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22754.28it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23738.19it/s]
100%|██████████| 100000/100000 [00:03<00:00, 25407.26it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24630.60it/

{-0.01: 4343.6,
 -0.001: 4309.0,
 0.0: 4578.8,
 0.001: 3705.8,
 0.01: 6098.8,
 0.1: 271.2,
 0.5: 273.4,
 'baseline': 2392.0,
 'ratio': 3241.6}

In [23]:
results_opt = grid_search('optimistic')
results_opt

100%|██████████| 100000/100000 [00:04<00:00, 20213.24it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21092.26it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21410.92it/s]
100%|██████████| 100000/100000 [00:04<00:00, 20632.74it/s]
100%|██████████| 100000/100000 [00:04<00:00, 20552.11it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21786.75it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21429.28it/s]
100%|██████████| 100000/100000 [00:04<00:00, 20028.88it/s]
100%|██████████| 100000/100000 [00:04<00:00, 20168.36it/s]
100%|██████████| 100000/100000 [00:04<00:00, 20461.61it/s]
100%|██████████| 100000/100000 [00:04<00:00, 20834.96it/s]
100%|██████████| 100000/100000 [00:04<00:00, 20937.56it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21491.51it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21130.18it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21038.97it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21206.42it/s]
100%|██████████| 100000/100000 [00:04<00:00, 20804.59it/

{-0.01: 1163.8,
 -0.001: 1162.0,
 0.0: 1164.2,
 0.001: 1161.0,
 0.01: 1161.6,
 0.1: 285.4,
 0.5: 279.2,
 'baseline': 1163.4,
 'ratio': 1162.2}

In [22]:
results_pess = grid_search('pessimistic')
results_pess

100%|██████████| 100000/100000 [00:04<00:00, 23233.12it/s]
100%|██████████| 100000/100000 [00:04<00:00, 20315.99it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22908.18it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23547.96it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24087.28it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22958.19it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21381.14it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21274.16it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22475.32it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22250.09it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21376.56it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21582.04it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21586.70it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21456.90it/s]
100%|██████████| 100000/100000 [00:04<00:00, 20739.81it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21127.95it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21157.02it/

{-0.01: 8517.2,
 -0.001: 8665.4,
 0.0: 8277.8,
 0.001: 1116.6,
 0.01: 799.6,
 0.1: 434.4,
 0.5: 274.6,
 'baseline': 8130.8,
 'ratio': 1184.8}

In [24]:
print("Pess Init Env 4: No wind")
results_pess

Pess Init Env 4: No wind


{-0.01: 8517.2,
 -0.001: 8665.4,
 0.0: 8277.8,
 0.001: 1116.6,
 0.01: 799.6,
 0.1: 434.4,
 0.5: 274.6,
 'baseline': 8130.8,
 'ratio': 1184.8}

In [25]:
print("Opt Init Env 4: No wind")
results_opt

Opt Init Env 4: No wind


{-0.01: 1163.8,
 -0.001: 1162.0,
 0.0: 1164.2,
 0.001: 1161.0,
 0.01: 1161.6,
 0.1: 285.4,
 0.5: 279.2,
 'baseline': 1163.4,
 'ratio': 1162.2}

In [26]:
print("Rand Init Env 4: No wind")
results_rand

Rand Init Env 4: No wind


{-0.01: 4343.6,
 -0.001: 4309.0,
 0.0: 4578.8,
 0.001: 3705.8,
 0.01: 6098.8,
 0.1: 271.2,
 0.5: 273.4,
 'baseline': 2392.0,
 'ratio': 3241.6}