In [1]:
from lib.algorithms import Q_learn_freetime, Q_learn

In [2]:
import hydra
# Initialise Hydra with the 'configs' directory as the config search path.
# NOTE(review): this is global, stateful setup — presumably it must run once
# per kernel before any `hydra.compose` call; confirm against the Hydra docs.
hydra.initialize(config_path='configs')
# Compose the configuration named 'config' and keep it as the notebook-wide `cfg`.
cfg = hydra.compose(config_name='config')


In [21]:
# Compose the 'config' configuration again and rebind the notebook-wide `cfg`.
cfg = hydra.compose(config_name='config')
from lib.gym_windy_gridworld import WindyGridworld

# Build the gridworld from the `env` section of the configuration.
# `rewards` and `allowed_actions` are copied into plain Python lists before
# being handed to the environment.
env = WindyGridworld(
    height=cfg.env.height,
    width=cfg.env.width,
    rewards=[*cfg.env.rewards],
    wind=cfg.env.wind,
    start=cfg.env.start,
    allowed_actions=[*cfg.env.allowed_actions],
    reward_terminates_episode=cfg.env.reward_terminates_episode,
)
    

In [22]:
# Display the composed configuration so the run settings are recorded with the results.
cfg

{'initializations': ['random', 'pessimistic', 'optimistic'], 'random_initialization_seed': None, 'num_runs': 10, 'offset': 0.001, 'env': {'height': 20, 'width': 11, 'rewards': [[1, 10, 5]], 'wind': False, 'start': 'random', 'allowed_actions': ['L', 'R', 'U', 'D', 'UL', 'UR', 'DL', 'DR'], 'reward_terminates_episode': True}, 'baseline': {'discount': 0.98, 'alpha': 0.01, 'num_steps': 100000, 'epsilon': 0.05, 'show_rewards': True, 'show_q': True, 'show_trajectory': False}, 'freetime': {'num_steps': 100000, 'epsilon': 0.05, 'discount': 0.98, 'alpha': 0.01, 'alpha_f': 0.01, 'tolerance': 0.0, 'show_rewards': True, 'show_q': True, 'show_f': False, 'show_f_actions': ['min'], 'show_trajectory': False}, 'trajectory_maps': {'num_plots': 2}, 'q_plots': {'vmin': 0, 'vmax': 2}, 'f_plots': {'vmin': None, 'vmax': None}, 'plot_freetime_vs_baseline_same_table': True}

In [23]:
# Give the free-time learner and the baseline learner the same number of
# training steps (assigned left to right: freetime first, then baseline),
# then display the resulting configuration.
cfg.freetime.num_steps = cfg.baseline.num_steps = 100000
cfg

{'initializations': ['random', 'pessimistic', 'optimistic'], 'random_initialization_seed': None, 'num_runs': 10, 'offset': 0.001, 'env': {'height': 20, 'width': 11, 'rewards': [[1, 10, 5]], 'wind': False, 'start': 'random', 'allowed_actions': ['L', 'R', 'U', 'D', 'UL', 'UR', 'DL', 'DR'], 'reward_terminates_episode': True}, 'baseline': {'discount': 0.98, 'alpha': 0.01, 'num_steps': 100000, 'epsilon': 0.05, 'show_rewards': True, 'show_q': True, 'show_trajectory': False}, 'freetime': {'num_steps': 100000, 'epsilon': 0.05, 'discount': 0.98, 'alpha': 0.01, 'alpha_f': 0.01, 'tolerance': 0.0, 'show_rewards': True, 'show_q': True, 'show_f': False, 'show_f_actions': ['min'], 'show_trajectory': False}, 'trajectory_maps': {'num_plots': 2}, 'q_plots': {'vmin': 0, 'vmax': 2}, 'f_plots': {'vmin': None, 'vmax': None}, 'plot_freetime_vs_baseline_same_table': True}

In [31]:
from lib.algorithms import build_q_table

def grid_search(
    initialization,
    tolerances=(-0.01, -0.001, 0.0, 0.001, 0.01, 0.1, 0.5),
    num_runs=5,
):
    """Compare free-time Q-learning across tolerances against plain Q-learning.

    Every experiment starts from a freshly built Q-table, trains on the
    notebook-global ``env`` with hyper-parameters taken from the
    notebook-global ``cfg``, and records the last element of the ``rewards``
    sequence returned by the learner. Each setting is repeated ``num_runs``
    times and the recorded values are averaged.

    Experiments run in this order: one batch per entry of ``tolerances``
    (``Q_learn_freetime``), then the baseline (``Q_learn``), then one more
    ``Q_learn_freetime`` batch that passes ``cfg.offset`` as the tolerance.

    Args:
        initialization: Q-table initialisation scheme forwarded to
            ``build_q_table`` (the notebook uses 'random', 'pessimistic'
            and 'optimistic').
        tolerances: Tolerance values to sweep for ``Q_learn_freetime``.
            Defaults to the grid originally hard-coded in this function.
        num_runs: Number of independent repetitions per setting. Defaults
            to 5, the value originally hard-coded in this function.

    Returns:
        dict: maps each tolerance value, plus the keys ``'baseline'`` and
        ``'ratio'``, to the mean recorded reward for that setting.

    Raises:
        ValueError: if ``num_runs`` is smaller than 1 (the mean would be
            undefined).
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    def fresh_q_table():
        # A new table per run so that no learning leaks between experiments.
        return build_q_table(
            (env.height, env.width),
            env.action_space.n,
            initialization=initialization,
            seed=cfg.random_initialization_seed,  # type: ignore
        )

    def freetime_final_reward(tolerance):
        # Only the reward trace is needed; the learned tables are discarded.
        _, _, rewards, _ = Q_learn_freetime(
            env,
            fresh_q_table(),
            cfg.freetime.num_steps,
            cfg.freetime.epsilon,
            cfg.freetime.discount,
            cfg.freetime.alpha,
            cfg.freetime.alpha_f,
            tolerance,
        )
        return rewards[-1]

    def baseline_final_reward():
        _, rewards = Q_learn(
            env,
            fresh_q_table(),
            cfg.baseline.num_steps,
            cfg.baseline.epsilon,
            cfg.baseline.discount,
            cfg.baseline.alpha,
        )
        return rewards[-1]

    def mean_over_runs(run_once, *args):
        finals = [run_once(*args) for _ in range(num_runs)]
        return sum(finals) / len(finals)

    results = {}
    for tolerance in tolerances:
        results[tolerance] = mean_over_runs(freetime_final_reward, tolerance)

    # Baseline
    results['baseline'] = mean_over_runs(baseline_final_reward)

    # Ratio
    # NOTE(review): this batch is labelled 'ratio' but simply passes
    # cfg.offset in the tolerance position of Q_learn_freetime — confirm
    # that this is the intended "ratio" variant.
    results['ratio'] = mean_over_runs(freetime_final_reward, cfg.offset)

    return results


In [32]:
# Run the grid search with the 'pessimistic' Q-table initialisation and display
# the mean recorded reward per setting.
results = grid_search('pessimistic')
results

100%|██████████| 100000/100000 [00:03<00:00, 28244.27it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28164.65it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28324.33it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28268.24it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28073.64it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28514.30it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28587.72it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28412.93it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28344.42it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28300.26it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29780.65it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29674.52it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29727.49it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29551.63it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29634.91it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29434.10it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29447.12it/

{-0.01: 8507.6,
 -0.001: 8656.0,
 0.0: 8416.0,
 0.001: 1289.4,
 0.01: 792.6,
 0.1: 425.4,
 0.5: 284.4,
 'baseline': 7941.4,
 'ratio': 1265.6}

In [27]:
# Run the grid search with the 'random' Q-table initialisation and display the
# mean recorded reward per setting.
# NOTE(review): this cell's execution count (27) is lower than that of the cell
# defining grid_search (31), so the stored output may come from an earlier
# version of the function — re-run after Restart & Run All to confirm.
results = grid_search('random')
results

100%|██████████| 100000/100000 [00:03<00:00, 28196.44it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27846.73it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28332.36it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28856.05it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28931.24it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29193.30it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28152.74it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28653.31it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29040.57it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28706.82it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28952.21it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29193.24it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28785.39it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29040.58it/s]
100%|██████████| 100000/100000 [00:03<00:00, 29142.21it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28636.89it/s]
100%|██████████| 100000/100000 [00:03<00:00, 28437.19it/

{-0.01: 4415.6,
 -0.001: 4730.9,
 0.0: 4421.0,
 0.001: 4284.5,
 0.01: 4841.0,
 0.1: 3501.1,
 0.5: 4738.0,
 'baseline': 528.0,
 'ratio': 3445.4}

In [28]:
# Run the grid search with the 'optimistic' Q-table initialisation and display
# the mean recorded reward per setting.
# NOTE(review): this cell's execution count (28) is lower than that of the cell
# defining grid_search (31), so the stored output may come from an earlier
# version of the function — re-run after Restart & Run All to confirm.
results = grid_search('optimistic')
results

100%|██████████| 100000/100000 [00:03<00:00, 27528.33it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27490.47it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27608.20it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27638.75it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27842.85it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27665.54it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27585.33it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27742.35it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27854.50it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27220.83it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27753.92it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27707.73it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27742.35it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27870.04it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27287.74it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27280.29it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27437.62it/

{-0.01: 1160.9,
 -0.001: 1163.1,
 0.0: 1164.4,
 0.001: 1161.7,
 0.01: 1163.6,
 0.1: 1163.3,
 0.5: 1162.9,
 'baseline': 1161.0,
 'ratio': 1162.0}