In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pickle
import os

from scipy import stats
from env import SlitEnv
from q_table import QTable
from ensembles import TDWAverageEnsemble, TDWVoteEnsemble
from ensembles import Single, AverageEnsemble, VoteEnsemble
from train import train, evaluate_ensemble, evaluate_combinations, evaluate_or_load

In [None]:
# Training configuration. Both values are forwarded unchanged to train() below.
EPSILON = 0.3
STEPS = 1000000

# Single source of truth for the Q-table cache file, so the existence check,
# the dump and the load can never drift apart. Delete the file to retrain.
Q_TABLES_PATH = 'q_table_slitenv_13x13.pkl'

env = SlitEnv(13, goal_reward=100.0, step_penalty=-0.1)

if not os.path.exists(Q_TABLES_PATH):
    # No cache on disk: build and train ten Q-tables.
    # NOTE(review): presumably QTable(n_states, n_actions) with 4 moves on an
    # env.size x env.size grid -- confirm against q_table.py.
    q_tables = [QTable(env.size ** 2, 4) for _ in range(10)]

    for i, q_table in enumerate(q_tables):
        # NOTE(review): the table index is passed as train()'s third argument;
        # presumably a seed or run id -- confirm against train.py.
        steps, rewards = train(env, q_table, i, STEPS, EPSILON)
        plt.plot(steps, rewards, label='table{}'.format(i))
    plt.title('training reward')
    plt.xlabel('step')
    plt.ylabel('reward')
    plt.legend()
    with open(Q_TABLES_PATH, 'wb') as f:
        pickle.dump(q_tables, f)
else:
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook produced itself; never one from an untrusted source.
    with open(Q_TABLES_PATH, 'rb') as f:
        q_tables = pickle.load(f)

In [None]:
# Inspect trained Q-table 0 in its own cell so its output is shown separately.
# NOTE(review): the following cells repeat this call for tables 1-9; a loop
# would be shorter, but only if visualize() produces a fresh output per call
# -- confirm in q_table.py before merging the cells.
q_tables[0].visualize()

In [None]:
# Inspect trained Q-table 1 (own cell, so its output is shown separately).
q_tables[1].visualize()

In [None]:
# Inspect trained Q-table 2 (own cell, so its output is shown separately).
q_tables[2].visualize()

In [None]:
# Inspect trained Q-table 3 (own cell, so its output is shown separately).
q_tables[3].visualize()

In [None]:
# Inspect trained Q-table 4 (own cell, so its output is shown separately).
q_tables[4].visualize()

In [None]:
# Inspect trained Q-table 5 (own cell, so its output is shown separately).
q_tables[5].visualize()

In [None]:
# Inspect trained Q-table 6 (own cell, so its output is shown separately).
q_tables[6].visualize()

In [None]:
# Inspect trained Q-table 7 (own cell, so its output is shown separately).
q_tables[7].visualize()

In [None]:
# Inspect trained Q-table 8 (own cell, so its output is shown separately).
q_tables[8].visualize()

In [None]:
# Inspect trained Q-table 9 (own cell, so its output is shown separately).
q_tables[9].visualize()

In [None]:
# Evaluation settings shared by every ensemble in this cell, kept in one place
# so the runs stay comparable.
EVAL_KWARGS = dict(num_episodes=20000, epsilon=0.1)

# NOTE(review): the first and seventh positional arguments (both 10 here) are
# forwarded as-is; their meaning is defined by train.evaluate_or_load --
# confirm there. The string arguments presumably name the result cache, so
# they are left untouched.
avg10_rewards = evaluate_or_load(
    10, 'slitenv', 'average10', env, q_tables,
    lambda x: AverageEnsemble(x), 10, **EVAL_KWARGS)

vote10_rewards = evaluate_or_load(
    10, 'slitenv', 'vote10', env, q_tables,
    lambda x: VoteEnsemble(x), 10, **EVAL_KWARGS)

# Sweep the TDW decay over i * 0.2 for i = 0..4. The name suffix stays i * 2
# (0, 2, 4, 6, 8) exactly as before.
tdw_avg10_rewards_by_decay = []
for i in range(5):
    decay = i * 0.2
    rewards = evaluate_or_load(
        10, 'slitenv', 'tdw_avg10_{}'.format(i * 2), env, q_tables,
        # decay is bound as a default argument so the factory keeps this
        # iteration's value even if it is invoked after the loop has advanced
        # (a plain closure over `i` would see the latest value instead).
        lambda x, decay=decay: TDWAverageEnsemble(x, decay=decay), 10,
        **EVAL_KWARGS)
    tdw_avg10_rewards_by_decay.append(rewards)

tdw_vote10_rewards_by_decay = []
for i in range(5):
    decay = i * 0.2
    rewards = evaluate_or_load(
        10, 'slitenv', 'tdw_vote10_{}'.format(i * 2), env, q_tables,
        # Same early binding of decay as in the averaging sweep above.
        lambda x, decay=decay: TDWVoteEnsemble(x, decay=decay), 10,
        **EVAL_KWARGS)
    tdw_vote10_rewards_by_decay.append(rewards)

In [None]:
# Summary for the averaging ensembles: mean of the evaluation rewards, with
# np.std (default ddof=0) in parentheses.
print('average (N=10)', np.mean(avg10_rewards), '({})'.format(np.std(avg10_rewards)))
for i in range(5):
    decay = i * 0.2
    # Format the label to one decimal: 3 * 0.2 is 0.6000000000000001 in binary
    # floating point and would otherwise be printed verbatim.
    print('tdw average (N=10, decay={:.1f})'.format(decay),
          np.mean(tdw_avg10_rewards_by_decay[i]),
          '({})'.format(np.std(tdw_avg10_rewards_by_decay[i])))

In [None]:
# Summary for the voting ensembles: mean of the evaluation rewards, with
# np.std (default ddof=0) in parentheses.
print('vote (N=10)', np.mean(vote10_rewards), '({})'.format(np.std(vote10_rewards)))
for i in range(5):
    decay = i * 0.2
    # Format the label to one decimal: 3 * 0.2 is 0.6000000000000001 in binary
    # floating point and would otherwise be printed verbatim.
    print('tdw vote (N=10, decay={:.1f})'.format(decay),
          np.mean(tdw_vote10_rewards_by_decay[i]),
          '({})'.format(np.std(tdw_vote10_rewards_by_decay[i])))

In [None]:
# Baseline: evaluate the Single wrapper with the same episode count and
# epsilon as the ensembles, then report mean reward with np.std in parentheses.
# NOTE(review): the leading 1 (vs. 10 for the ensembles) and the positional 10
# are interpreted by train.evaluate_or_load -- confirm their meaning there.
single_rewards = evaluate_or_load(
    1, 'slitenv', 'single', env, q_tables,
    lambda x: Single(x), 10,
    num_episodes=20000, epsilon=0.1,
)
single_mean = np.mean(single_rewards)
single_std = np.std(single_rewards)
print('single', single_mean, '({})'.format(single_std))