In [1]:
from lib.algorithms import Q_learn_freetime, Q_learn

In [2]:
import hydra
hydra.initialize(config_path='configs')
cfg = hydra.compose(config_name='config')


In [3]:
cfg = hydra.compose(config_name='config')
from lib.gym_windy_gridworld import WindyGridworld
env = WindyGridworld(
        height=cfg.env.height, 
        width=cfg.env.width, 
        rewards=list(cfg.env.rewards), 
        wind=cfg.env.wind, 
        start=cfg.env.start, 
        allowed_actions=list(cfg.env.allowed_actions), 
        reward_terminates_episode=cfg.env.reward_terminates_episode
    )
    

In [4]:
cfg

{'initializations': ['random', 'pessimistic', 'optimistic'], 'random_initialization_seed': None, 'num_runs': 1, 'offset': 0.001, 'env': {'height': 20, 'width': 11, 'rewards': [[1, 0, 0], [2, 0, 11]], 'wind': True, 'start': 'random', 'allowed_actions': ['L', 'R', 'C'], 'reward_terminates_episode': True}, 'baseline': {'discount': 0.98, 'alpha': 0.01, 'num_steps': 100000, 'epsilon': 0.05, 'show_rewards': True, 'show_q': True, 'show_trajectory': False}, 'freetime': {'num_steps': 100000, 'epsilon': 0.05, 'discount': 0.98, 'alpha': 0.01, 'alpha_f': 0.01, 'tolerance': 0, 'show_rewards': True, 'show_q': True, 'show_f': False, 'show_f_actions': ['max'], 'show_trajectory': False}, 'trajectory_maps': {'num_plots': 2}, 'q_plots': {'vmin': 0, 'vmax': 2}, 'f_plots': {'vmin': None, 'vmax': None}, 'plot_freetime_vs_baseline_same_table': True}

In [5]:
cfg.freetime.num_steps = 100000
cfg.baseline.num_steps = 100000
cfg

{'initializations': ['random', 'pessimistic', 'optimistic'], 'random_initialization_seed': None, 'num_runs': 1, 'offset': 0.001, 'env': {'height': 20, 'width': 11, 'rewards': [[1, 0, 0], [2, 0, 11]], 'wind': True, 'start': 'random', 'allowed_actions': ['L', 'R', 'C'], 'reward_terminates_episode': True}, 'baseline': {'discount': 0.98, 'alpha': 0.01, 'num_steps': 100000, 'epsilon': 0.05, 'show_rewards': True, 'show_q': True, 'show_trajectory': False}, 'freetime': {'num_steps': 100000, 'epsilon': 0.05, 'discount': 0.98, 'alpha': 0.01, 'alpha_f': 0.01, 'tolerance': 0, 'show_rewards': True, 'show_q': True, 'show_f': False, 'show_f_actions': ['max'], 'show_trajectory': False}, 'trajectory_maps': {'num_plots': 2}, 'q_plots': {'vmin': 0, 'vmax': 2}, 'f_plots': {'vmin': None, 'vmax': None}, 'plot_freetime_vs_baseline_same_table': True}

In [6]:
from lib.algorithms import build_q_table

def grid_search(initialization):
    
    results = {}
    for tolerance in [-0.01, -0.001, 0.0, 0.001, 0.01, 0.1, 0.5]:
        
        results[tolerance] = []
        
        for exp in range(10):
            
            Q = build_q_table(
                (env.height, env.width),                
                env.action_space.n, 
                initialization = initialization,
                seed = cfg.random_initialization_seed # type: ignore
            )
            
            Q, F, rewards, _ = Q_learn_freetime(
                env, 
                Q, 
                cfg.freetime.num_steps, 
                cfg.freetime.epsilon, 
                cfg.freetime.discount, 
                cfg.freetime.alpha, 
                cfg.freetime.alpha_f, 
                cfg.freetime.tolerance
            )
            
            results[tolerance].append(rewards[-1])
    # Baseline
    results['baseline'] = []
    for exp in range(5):
        
        Q = build_q_table(
                (env.height, env.width),                
                env.action_space.n, 
                initialization = initialization,
                seed = cfg.random_initialization_seed # type: ignore
            )
        
        Q, rewards = Q_learn(
                env, 
                Q, 
                cfg.baseline.num_steps, 
                cfg.baseline.epsilon, 
                cfg.baseline.discount, 
                cfg.baseline.alpha
            )
        
        results['baseline'].append(rewards[-1])

    # Ratio
    results['ratio'] = []
    for exp in range(5):
        
        Q = build_q_table(
                (env.height, env.width),                
                env.action_space.n, 
                initialization = initialization,
                seed = cfg.random_initialization_seed # type: ignore
            )
        
        Q, F, rewards, _ = Q_learn_freetime(
                env, 
                Q, 
                cfg.freetime.num_steps, 
                cfg.freetime.epsilon, 
                cfg.freetime.discount, 
                cfg.freetime.alpha, 
                cfg.freetime.alpha_f, 
                cfg.offset
            )
        
        results['ratio'].append(rewards[-1])
        
    results = {k: sum(v)/len(v) for k, v in results.items()} 
    return results


In [7]:
results = grid_search('pessimistic')
results

100%|██████████| 100000/100000 [00:04<00:00, 24446.80it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24072.79it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23891.46it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24509.77it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24378.21it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24318.87it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24363.35it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24461.76it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24372.26it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24600.29it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24657.97it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24777.20it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24746.52it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23656.71it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23897.18it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24375.24it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24345.53it/

{-0.01: 6879.1,
 -0.001: 5731.0,
 0.0: 6729.3,
 0.001: 5389.2,
 0.01: 5932.3,
 0.1: 6162.1,
 0.5: 6234.4,
 'baseline': 4498.4,
 'ratio': 5180.4}

In [8]:
results = grid_search('random')
results

100%|██████████| 100000/100000 [00:04<00:00, 22749.11it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23189.99it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23198.07it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23542.43it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22905.56it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23098.85it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22640.85it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22676.82it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22712.91it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23096.18it/s]
100%|██████████| 100000/100000 [00:04<00:00, 21610.05it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22222.89it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22487.98it/s]
100%|██████████| 100000/100000 [00:04<00:00, 22633.16it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23173.85it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23257.47it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23195.38it/

{-0.01: 3510.0,
 -0.001: 4094.7,
 0.0: 4628.5,
 0.001: 3726.3,
 0.01: 3628.1,
 0.1: 3849.2,
 0.5: 4604.9,
 'baseline': 1931.0,
 'ratio': 4243.6}

In [9]:
results = grid_search('optimistic')
results

100%|██████████| 100000/100000 [00:04<00:00, 23542.44it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23503.66it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23454.01it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23547.98it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23581.33it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23437.51it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23152.38it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23777.74it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23050.89it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23908.61it/s]
100%|██████████| 100000/100000 [00:04<00:00, 23721.29it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24006.27it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24277.51it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24154.26it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24029.36it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24180.56it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24113.45it/

{-0.01: 7159.1,
 -0.001: 7042.6,
 0.0: 7077.6,
 0.001: 7077.0,
 0.01: 7056.6,
 0.1: 7063.7,
 0.5: 7008.9,
 'baseline': 7131.6,
 'ratio': 7061.6}