Connected to PRL (Python 3.11.0)

In [2]:
# Define the intervals for gamma and reward shaping factor
gamma_interval = [0.93, 0.99]
reward_shaping_interval = [0.5, 0.9]
peakvalley = [True, False]
# #battery_factor_interval = [0.0, 0.15]

# # Define the number of iterations for the random search
num_iterations = 100

for i in range(num_iterations):
    
    # Generate random values within the intervals
    gamma = round(np.random.uniform(*gamma_interval), 4)
    factor = round(np.random.uniform(*reward_shaping_interval), 4)
    pv = random.choice(peakvalley)
    battery_factor = None #round(np.random.uniform(*battery_factor_interval), 4)
    
    print(f'\nIteration: {i+1}, Gamma: {gamma}, Reward Shaping Factor: {factor}, Peak and Valley: {pv}')
        
    seed = 2705
    rep = 105000 * 3
    batch_size = 48
    gamma = gamma
    epsilon = 1.0
    epsilon_decay = 99999
    epsilon_min = 0.1
    learning_rate = 5e-5
    price_horizon = 48
    future_horizon = 0
    hidden_dim = 128
    num_layers = 4
    positions = False
    action_classes = 3
    reward_shaping = True
    implicit_shape = True
    factor = factor
    verbose = False
    normalize = True
    df = train_name


    # Initialize Environment
    env = Electric_Car(path_to_test_data=df)
    val_env = Electric_Car(path_to_test_data=val_name)

    #Initialize DQN
    agent = DDQNAgent(env = env,
                    features = features_train,
                    epsilon_decay = epsilon_decay,
                    epsilon_start = epsilon,
                    epsilon_end = epsilon_min,
                    discount_rate = gamma,
                    lr = learning_rate,
                    buffer_size = 100000,
                    price_horizon = price_horizon,
                    hidden_dim=hidden_dim,
                    num_layers = num_layers,
                    positions = positions,
                    action_classes = action_classes, 
                    reward_shaping = reward_shaping,
                    implicit_shape = implicit_shape,
                    shaping_factor = factor,
                    normalize = normalize,
                    verbose = verbose)

    val_agent = DDQNAgent(env = val_env,
                    features = features_val,
                    epsilon_decay = epsilon_decay,
                    epsilon_start = epsilon,
                    epsilon_end = epsilon_min,
                    discount_rate = gamma,
                    lr = learning_rate,
                    buffer_size = 100000,
                    price_horizon = price_horizon,
                    hidden_dim=hidden_dim,
                    num_layers = num_layers,
                    positions = positions,
                    action_classes = action_classes, 
                    reward_shaping = reward_shaping,
                    implicit_shape = implicit_shape,
                    shaping_factor = factor,
                    normalize = normalize,
                    verbose = verbose)


    episode_balance = 0
    episode_loss = 0
    episode_counter = 0
    episode_reward = 0

    obs,_,_,_,_ = env.step(0) # First do nothing
    state = agent.obs_to_state(obs)
    
    for i in tqdm(range(rep)):

        action, q = agent.choose_action(i, state, greedy = False) # Choose action (discrete)
        cont_action = agent.action_to_cont(action) # Convert to continuous action
        
        # Get new observation and state
        new_obs, r, t, _, _ = env.step(cont_action)
        new_state = agent.obs_to_state(new_obs)
        
        # Reward Shaping            
        new_reward = agent.shape_reward(r, cont_action, peakvalley = pv)

        # Fill replay buffer - THIS IS THE ONLY THING WE DO WITH THE CURRENT OBSERVATION - LEARNING IS FULLY PERFORMED FROM THE REPLAY BUFFER
        if state.shape[0] == agent.state_dim and new_state.shape[0] == agent.state_dim:
            agent.replay_memory.add_data((state, action, new_reward, t, new_state))

        #Update DQN
        loss = agent.optimize(batch_size)
        
        # Update values
        episode_balance += r
        episode_reward += r
        episode_loss += loss

        # New observation
        state = new_state        

        if t:
            # Reset Environment
            env.counter = 0
            env.hour = 1
            env.day = 1
            episode_counter += 1
            
            if episode_counter % 4 == 0:
                print('Episode ', episode_counter, 'Balance: ', episode_balance, 'Reward: ', episode_reward, 'Loss: ', episode_loss) # Add both balance and reward to see how training objective and actually spent money differ
            
            # Scheduler Step
            agent.scheduler.step(episode_loss)
            
            episode_loss = 0
            episode_balance = 0
            episode_reward = 0
            
            
            if episode_counter % 4 == 0:
                # Evaluate DQN - Training
                train_dqn = DDQNEvaluation(price_horizon = price_horizon)
                _ = train_dqn.evaluate(agent = agent)
                
                # Evaluate DQN - Validation
                val_agent.dqn_predict.load_state_dict(agent.dqn_predict.state_dict())
                val_dqn = DDQNEvaluation(price_horizon = price_horizon)
                balance = val_dqn.evaluate(agent = val_agent, validation = True)
                
                # Reset Environments after evaluation
                env.counter = 0
                env.hour = 1
                env.day = 1
                val_env.counter = 0
                val_env.hour = 1
                val_env.day = 1
                
                if balance < -650:
                    print("Balance too low, stopping training")
                    #break
            
                
    # Save agent
    torch.save(agent.dqn_predict.state_dict(), f'models/change_tuning_gamma_{gamma}_valley_{pv}.pt')


Iteration: 1, Gamma: 0.949, Reward Shaping Factor: 0.5818, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104153/315000 [06:13<21:42, 161.84it/s]

Episode  4 Balance:  -2180.5101852469056 Reward:  -2180.5101852469056 Loss:  1177.8178064418025
Training Balance:  -443.1112687037038


 33%|███▎      | 104188/315000 [06:36<15:45:17,  3.72it/s]

Validation Balance:  -783.6901994444445
Balance too low, stopping training


 66%|██████▋   | 209367/315000 [12:23<05:04, 346.66it/s]  

Episode  8 Balance:  -1101.418363456792 Reward:  -1101.418363456792 Loss:  505.7090314544039
Training Balance:  -68.09377740740749


 66%|██████▋   | 209402/315000 [12:39<4:11:31,  7.00it/s]

Validation Balance:  -620.8837809259261


100%|█████████▉| 314555/315000 [17:40<00:01, 361.26it/s] 

Episode  12 Balance:  -849.5615846296276 Reward:  -849.5615846296276 Loss:  560.3405727790669
Training Balance:  -85.00176055555568


100%|█████████▉| 314620/315000 [17:54<00:33, 11.34it/s] 

Validation Balance:  -606.9081188271606


100%|██████████| 315000/315000 [17:55<00:00, 292.86it/s]



Iteration: 2, Gamma: 0.9802, Reward Shaping Factor: 0.7581, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104146/315000 [05:18<11:22, 308.87it/s]

Episode  4 Balance:  -1744.4806950000013 Reward:  -1744.4806950000013 Loss:  1212.7783494386822
Training Balance:  -195.88523111111118


 33%|███▎      | 104210/315000 [05:32<5:25:58, 10.78it/s]

Validation Balance:  -596.8386060493827


 66%|██████▋   | 209344/315000 [10:33<04:54, 358.20it/s] 

Episode  8 Balance:  -994.9259036419732 Reward:  -994.9259036419732 Loss:  587.5120902776252
Training Balance:  -181.1113624074075


 66%|██████▋   | 209406/315000 [10:47<2:41:34, 10.89it/s]

Validation Balance:  -416.32169919753096


 92%|█████████▏| 288354/315000 [14:32<01:13, 360.98it/s] 

Epoch 00011: reducing learning rate of group 0 to 2.5000e-05.


100%|█████████▉| 314559/315000 [15:46<00:01, 363.43it/s]

Episode  12 Balance:  -824.4581128395064 Reward:  -824.4581128395064 Loss:  734.1033428343944
Training Balance:  -82.38569283950629


100%|█████████▉| 314629/315000 [16:00<00:31, 11.88it/s] 

Validation Balance:  -569.8001912962964


100%|██████████| 315000/315000 [16:01<00:00, 327.51it/s]



Iteration: 3, Gamma: 0.9511, Reward Shaping Factor: 0.5356, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104142/315000 [05:19<11:23, 308.57it/s]

Episode  4 Balance:  -1977.4801751234631 Reward:  -1977.4801751234631 Loss:  937.8917705323547
Training Balance:  -117.97411851851855


 33%|███▎      | 104197/315000 [05:33<5:48:36, 10.08it/s]

Validation Balance:  -714.6211627777777
Balance too low, stopping training


 66%|██████▋   | 209352/315000 [10:30<04:58, 354.20it/s] 

Episode  8 Balance:  -933.6940392592583 Reward:  -933.6940392592583 Loss:  405.4375205823453
Training Balance:  128.04310999999996


 66%|██████▋   | 209406/315000 [10:47<3:11:07,  9.21it/s]

Validation Balance:  -645.0540654938272


100%|█████████▉| 314584/315000 [15:45<00:01, 363.25it/s] 

Episode  12 Balance:  -813.1753206790119 Reward:  -813.1753206790119 Loss:  413.01297138573136
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Balance:  112.42040722222221


100%|█████████▉| 314654/315000 [15:59<00:29, 11.76it/s] 

Validation Balance:  -671.054718580247
Balance too low, stopping training


100%|██████████| 315000/315000 [16:00<00:00, 327.83it/s]



Iteration: 4, Gamma: 0.9591, Reward Shaping Factor: 0.6656, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104154/315000 [05:16<11:07, 315.94it/s]

Episode  4 Balance:  -2086.1081319753157 Reward:  -2086.1081319753157 Loss:  1023.455471070949
Training Balance:  -152.8833396913581


 33%|███▎      | 104195/315000 [05:30<6:24:26,  9.14it/s]

Validation Balance:  -636.608418888889


 66%|██████▋   | 209341/315000 [10:26<06:00, 292.79it/s] 

Episode  8 Balance:  -842.4136930864189 Reward:  -842.4136930864189 Loss:  436.03786924737506
Training Balance:  -9.045496913580333


 66%|██████▋   | 209409/315000 [10:41<2:32:23, 11.55it/s]

Validation Balance:  -551.1249981481483


100%|█████████▉| 314551/315000 [15:38<00:01, 367.94it/s] 

Episode  12 Balance:  -722.3539076543215 Reward:  -722.3539076543215 Loss:  491.0308377936017
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Balance:  153.05809209876534


100%|█████████▉| 314621/315000 [15:52<00:31, 11.85it/s] 

Validation Balance:  -592.7727314814815


100%|██████████| 315000/315000 [15:53<00:00, 330.22it/s]



Iteration: 5, Gamma: 0.9697, Reward Shaping Factor: 0.7412, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104137/315000 [05:21<11:52, 295.94it/s]

Episode  4 Balance:  -1933.0626536419736 Reward:  -1933.0626536419736 Loss:  1697.1100215977058
Training Balance:  -139.80065500000003


 33%|███▎      | 104199/315000 [05:35<5:29:03, 10.68it/s]

Validation Balance:  -658.3714483333334
Balance too low, stopping training


 66%|██████▋   | 209368/315000 [10:32<04:47, 367.34it/s] 

Episode  8 Balance:  -1128.2184925308636 Reward:  -1128.2184925308636 Loss:  803.5648263373878
Training Balance:  -118.45101740740752


 66%|██████▋   | 209408/315000 [10:46<3:01:24,  9.70it/s]

Validation Balance:  -789.8198223456792
Balance too low, stopping training


100%|█████████▉| 314561/315000 [15:43<00:01, 364.04it/s] 

Episode  12 Balance:  -1035.739761358023 Reward:  -1035.739761358023 Loss:  983.006367596332
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Balance:  -190.08813388888893


100%|█████████▉| 314620/315000 [15:57<00:34, 10.99it/s] 

Validation Balance:  -786.4129423456792
Balance too low, stopping training


100%|██████████| 315000/315000 [15:59<00:00, 328.44it/s]



Iteration: 6, Gamma: 0.9372, Reward Shaping Factor: 0.7441, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104147/315000 [05:17<11:14, 312.43it/s]

Episode  4 Balance:  -2134.066804382709 Reward:  -2134.066804382709 Loss:  1518.261083940044
Training Balance:  -320.2682543209877


 33%|███▎      | 104212/315000 [05:31<5:24:49, 10.82it/s]

Validation Balance:  -757.118748888889
Balance too low, stopping training


 66%|██████▋   | 209374/315000 [10:30<05:05, 345.82it/s] 

Episode  8 Balance:  -967.6919030864182 Reward:  -967.6919030864182 Loss:  580.5513503055554
Training Balance:  -93.27885950617295


 66%|██████▋   | 209409/315000 [10:44<3:27:52,  8.47it/s]

Validation Balance:  -800.051497654321
Balance too low, stopping training


100%|█████████▉| 314583/315000 [15:39<00:01, 289.85it/s] 

Episode  12 Balance:  -987.0872946913556 Reward:  -987.0872946913556 Loss:  579.549884889042
Training Balance:  -65.20125820987666


100%|█████████▉| 314645/315000 [15:54<00:36,  9.60it/s] 

Validation Balance:  -795.3970389506173
Balance too low, stopping training


100%|██████████| 315000/315000 [15:55<00:00, 329.51it/s]



Iteration: 7, Gamma: 0.9758, Reward Shaping Factor: 0.8285, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104137/315000 [05:17<12:40, 277.37it/s]

Episode  4 Balance:  -1816.7432270987645 Reward:  -1816.7432270987645 Loss:  1826.470052169636
Training Balance:  -153.07766055555567


 33%|███▎      | 104195/315000 [05:34<7:11:42,  8.14it/s] 

Validation Balance:  -709.4947722222223
Balance too low, stopping training


 66%|██████▋   | 209349/315000 [10:28<04:50, 363.73it/s] 

Episode  8 Balance:  -1006.3764449999991 Reward:  -1006.3764449999991 Loss:  948.7128087361343
Training Balance:  -180.77879271604948


 66%|██████▋   | 209406/315000 [10:43<2:51:38, 10.25it/s]

Validation Balance:  -621.11067154321


100%|█████████▉| 314554/315000 [15:40<00:01, 349.88it/s] 

Episode  12 Balance:  -899.8845483333317 Reward:  -899.8845483333317 Loss:  1149.0108559420332
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Balance:  -197.0875495061729


100%|█████████▉| 314623/315000 [15:54<00:32, 11.54it/s] 

Validation Balance:  -674.8964372222223
Balance too low, stopping training


100%|██████████| 315000/315000 [15:55<00:00, 329.57it/s]



Iteration: 8, Gamma: 0.956, Reward Shaping Factor: 0.6522, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104143/315000 [05:21<11:18, 310.83it/s]

Episode  4 Balance:  -2093.1799966666676 Reward:  -2093.1799966666676 Loss:  1404.8640456022695
Training Balance:  -289.73425666666674


 33%|███▎      | 104197/315000 [05:35<5:57:04,  9.84it/s]

Validation Balance:  -706.3594700000001
Balance too low, stopping training


 66%|██████▋   | 209344/315000 [10:33<04:50, 364.32it/s] 

Episode  8 Balance:  -987.6504513580239 Reward:  -987.6504513580239 Loss:  651.9523127852008
Training Balance:  -62.767657037037154


 66%|██████▋   | 209415/315000 [10:48<2:30:27, 11.70it/s]

Validation Balance:  -760.6207167901235
Balance too low, stopping training


100%|█████████▉| 314559/315000 [15:55<00:01, 364.73it/s] 

Episode  12 Balance:  -883.5619846913584 Reward:  -883.5619846913584 Loss:  634.648729699431
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Balance:  -166.26710216049392


100%|█████████▉| 314629/315000 [16:09<00:31, 11.76it/s] 

Validation Balance:  -754.1230069753087
Balance too low, stopping training


100%|██████████| 315000/315000 [16:10<00:00, 324.47it/s]



Iteration: 9, Gamma: 0.9604, Reward Shaping Factor: 0.5192, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104152/315000 [05:21<11:33, 303.98it/s]

Episode  4 Balance:  -2062.9958070370353 Reward:  -2062.9958070370353 Loss:  1197.109889306128
Training Balance:  -238.83089265432102


 33%|███▎      | 104216/315000 [05:35<5:27:30, 10.73it/s]

Validation Balance:  -760.600193888889
Balance too low, stopping training


 66%|██████▋   | 209371/315000 [10:33<05:18, 331.65it/s] 

Episode  8 Balance:  -1022.9608001234581 Reward:  -1022.9608001234581 Loss:  562.854763836367
Training Balance:  -98.20831814814825


 66%|██████▋   | 209432/315000 [10:47<2:50:34, 10.32it/s]

Validation Balance:  -736.6379801851853
Balance too low, stopping training


100%|█████████▉| 314571/315000 [15:46<00:01, 321.77it/s] 

Episode  12 Balance:  -892.5645985802469 Reward:  -892.5645985802469 Loss:  650.1799488947727
Training Balance:  -53.94073506172849


100%|█████████▉| 314628/315000 [16:00<00:35, 10.55it/s] 

Validation Balance:  -730.6999000000001
Balance too low, stopping training


100%|██████████| 315000/315000 [16:02<00:00, 327.44it/s]



Iteration: 10, Gamma: 0.9444, Reward Shaping Factor: 0.742, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104132/315000 [05:17<11:05, 316.64it/s]

Episode  4 Balance:  -2105.211514814814 Reward:  -2105.211514814814 Loss:  1014.7929390701465
Training Balance:  -185.30211740740748


 33%|███▎      | 104196/315000 [05:30<5:22:42, 10.89it/s]

Validation Balance:  -658.4269266666666
Balance too low, stopping training


 66%|██████▋   | 209366/315000 [10:28<04:51, 362.24it/s] 

Episode  8 Balance:  -826.7094959876536 Reward:  -826.7094959876536 Loss:  351.0441897313576
Training Balance:  -23.48924104938274


 66%|██████▋   | 209435/315000 [10:42<2:30:02, 11.73it/s]

Validation Balance:  -565.255570617284


100%|█████████▉| 314575/315000 [15:42<00:01, 361.19it/s] 

Episode  12 Balance:  -614.6368988888892 Reward:  -614.6368988888892 Loss:  386.35791041201446
Training Balance:  163.26056851851848


100%|█████████▉| 314645/315000 [15:57<00:32, 10.76it/s] 

Validation Balance:  -567.0954217283951


100%|██████████| 315000/315000 [15:58<00:00, 328.50it/s]



Iteration: 11, Gamma: 0.9584, Reward Shaping Factor: 0.6148, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104157/315000 [05:19<11:11, 314.03it/s]

Episode  4 Balance:  -1911.0439403703745 Reward:  -1911.0439403703745 Loss:  1029.853808587417
Training Balance:  -100.73789148148151


 33%|███▎      | 104221/315000 [05:33<5:25:57, 10.78it/s]

Validation Balance:  -655.2671657407408
Balance too low, stopping training


 66%|██████▋   | 209347/315000 [10:33<04:50, 363.88it/s] 

Episode  8 Balance:  -665.8345395679 Reward:  -665.8345395679 Loss:  507.7072165030986
Training Balance:  40.15549839506167


 66%|██████▋   | 209416/315000 [10:47<2:30:52, 11.66it/s]

Validation Balance:  -551.9136629012346


 92%|█████████▏| 288347/315000 [14:28<01:12, 365.38it/s] 

Epoch 00011: reducing learning rate of group 0 to 2.5000e-05.


100%|█████████▉| 314578/315000 [15:44<00:01, 350.45it/s]

Episode  12 Balance:  -666.6853272839513 Reward:  -666.6853272839513 Loss:  456.2289116870379
Training Balance:  164.47294487654315


100%|█████████▉| 314620/315000 [15:59<00:39,  9.50it/s] 

Validation Balance:  -597.9151786419754


100%|██████████| 315000/315000 [16:00<00:00, 327.91it/s]



Iteration: 12, Gamma: 0.9308, Reward Shaping Factor: 0.5006, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104155/315000 [05:16<12:25, 282.78it/s]

Episode  4 Balance:  -2230.9315406172855 Reward:  -2230.9315406172855 Loss:  954.8252269625664
Training Balance:  -335.9656621604938


 33%|███▎      | 104196/315000 [05:30<7:02:15,  8.32it/s] 

Validation Balance:  -753.0108735185187
Balance too low, stopping training


 66%|██████▋   | 209370/315000 [10:26<04:55, 357.19it/s] 

Episode  8 Balance:  -982.4612381481477 Reward:  -982.4612381481477 Loss:  277.5365701196715
Training Balance:  -18.424904691358066


 66%|██████▋   | 209406/315000 [10:40<3:16:25,  8.96it/s]

Validation Balance:  -663.8338661111112
Balance too low, stopping training


100%|█████████▉| 314560/315000 [15:40<00:01, 323.61it/s] 

Episode  12 Balance:  -1049.4150634567902 Reward:  -1049.4150634567902 Loss:  325.35591807879973
Training Balance:  -4.23942228395061


100%|█████████▉| 314625/315000 [15:54<00:34, 10.96it/s] 

Validation Balance:  -600.5673740123457


100%|██████████| 315000/315000 [15:55<00:00, 329.70it/s]



Iteration: 13, Gamma: 0.969, Reward Shaping Factor: 0.5856, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104147/315000 [05:21<11:22, 309.09it/s]

Episode  4 Balance:  -1920.9654272839575 Reward:  -1920.9654272839575 Loss:  1387.3301120037213
Training Balance:  -238.9459898148149


 33%|███▎      | 104210/315000 [05:35<5:28:44, 10.69it/s]

Validation Balance:  -704.4974501234569
Balance too low, stopping training


 66%|██████▋   | 209350/315000 [10:35<04:53, 360.33it/s] 

Episode  8 Balance:  -898.9753306172823 Reward:  -898.9753306172823 Loss:  670.6130928865168
Training Balance:  -61.44081777777789


 66%|██████▋   | 209420/315000 [10:50<2:31:07, 11.64it/s]

Validation Balance:  -770.2821367283952
Balance too low, stopping training


100%|█████████▉| 314550/315000 [15:48<00:01, 360.55it/s] 

Episode  12 Balance:  -954.6746887654327 Reward:  -954.6746887654327 Loss:  719.7019036472775
Training Balance:  -130.55298006172848


100%|█████████▉| 314621/315000 [16:03<00:32, 11.68it/s] 

Validation Balance:  -749.5425936419754
Balance too low, stopping training


100%|██████████| 315000/315000 [16:04<00:00, 326.58it/s]



Iteration: 14, Gamma: 0.9855, Reward Shaping Factor: 0.5547, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104132/315000 [05:17<11:37, 302.37it/s]

Episode  4 Balance:  -1735.2850269135852 Reward:  -1735.2850269135852 Loss:  1104.403042848222
Training Balance:  -154.65345493827166


 33%|███▎      | 104193/315000 [05:31<5:30:13, 10.64it/s]

Validation Balance:  -642.3019192592594


 66%|██████▋   | 209347/315000 [10:29<04:50, 364.02it/s] 

Episode  8 Balance:  -768.9357691358039 Reward:  -768.9357691358039 Loss:  537.1712774406187
Training Balance:  -92.44967623456799


 66%|██████▋   | 209416/315000 [10:44<2:34:15, 11.41it/s]

Validation Balance:  -577.6063019135803


 92%|█████████▏| 288354/315000 [14:32<01:13, 364.89it/s] 

Epoch 00011: reducing learning rate of group 0 to 2.5000e-05.


100%|█████████▉| 314551/315000 [15:46<00:01, 364.76it/s]

Episode  12 Balance:  -721.2911608024687 Reward:  -721.2911608024687 Loss:  718.7997563611716
Training Balance:  51.19453598765426


100%|█████████▉| 314620/315000 [16:00<00:32, 11.63it/s] 

Validation Balance:  -557.7699688888889


100%|██████████| 315000/315000 [16:02<00:00, 327.40it/s]



Iteration: 15, Gamma: 0.9322, Reward Shaping Factor: 0.6789, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104137/315000 [05:17<11:02, 318.36it/s]

Episode  4 Balance:  -2109.058003456793 Reward:  -2109.058003456793 Loss:  937.0214970768429
Training Balance:  -65.8424204938272


 33%|███▎      | 104201/315000 [05:31<5:21:52, 10.91it/s]

Validation Balance:  -664.3054305555555
Balance too low, stopping training


 66%|██████▋   | 209353/315000 [10:29<04:54, 359.24it/s] 

Episode  8 Balance:  -887.4374974691339 Reward:  -887.4374974691339 Loss:  314.43984797317535
Training Balance:  79.11103493827156


 66%|██████▋   | 209409/315000 [10:43<2:45:39, 10.62it/s]

Validation Balance:  -680.6079725308643
Balance too low, stopping training


100%|█████████▉| 314574/315000 [15:45<00:01, 350.54it/s] 

Episode  12 Balance:  -810.1427923456772 Reward:  -810.1427923456772 Loss:  346.4511285542976
Training Balance:  86.55038512345675


100%|█████████▉| 314641/315000 [15:59<00:31, 11.51it/s] 

Validation Balance:  -681.6619488888889
Balance too low, stopping training


100%|██████████| 315000/315000 [16:00<00:00, 327.99it/s]



Iteration: 16, Gamma: 0.9819, Reward Shaping Factor: 0.7321, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104155/315000 [05:18<11:14, 312.82it/s]

Episode  4 Balance:  -1756.6857409876602 Reward:  -1756.6857409876602 Loss:  1307.2126069972292
Training Balance:  -239.9334709876544


 33%|███▎      | 104193/315000 [05:35<8:30:26,  6.88it/s] 

Validation Balance:  -566.9915904320989


 66%|██████▋   | 209374/315000 [10:51<04:58, 354.41it/s] 

Episode  8 Balance:  -894.7687140740736 Reward:  -894.7687140740736 Loss:  608.2392751188017
Training Balance:  -188.67897320987663


 66%|██████▋   | 209408/315000 [11:05<3:14:24,  9.05it/s]

Validation Balance:  -350.8103215432099


 92%|█████████▏| 288344/315000 [14:51<01:19, 337.31it/s] 

Epoch 00011: reducing learning rate of group 0 to 2.5000e-05.


100%|█████████▉| 314573/315000 [16:06<00:01, 341.35it/s]

Episode  12 Balance:  -749.0959431481471 Reward:  -749.0959431481471 Loss:  782.7021078965627
Training Balance:  -195.72321728395067


100%|█████████▉| 314613/315000 [16:22<00:50,  7.69it/s] 

Validation Balance:  -362.71954833333336


100%|██████████| 315000/315000 [16:23<00:00, 320.35it/s]



Iteration: 17, Gamma: 0.9467, Reward Shaping Factor: 0.7993, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104154/315000 [05:17<11:18, 310.70it/s]

Episode  4 Balance:  -1904.9916458642035 Reward:  -1904.9916458642035 Loss:  1609.9079369856045
Training Balance:  -175.0754438271606


 33%|███▎      | 104193/315000 [05:32<7:14:32,  8.09it/s] 

Validation Balance:  -683.3027813580248
Balance too low, stopping training


 66%|██████▋   | 209363/315000 [10:32<04:52, 360.95it/s] 

Episode  8 Balance:  -923.6601227777757 Reward:  -923.6601227777757 Loss:  624.929610762978
Training Balance:  -145.59960327160502


 66%|██████▋   | 209432/315000 [10:47<2:35:47, 11.29it/s]

Validation Balance:  -789.8142044444446
Balance too low, stopping training


100%|█████████▉| 314562/315000 [15:47<00:01, 353.46it/s] 

Episode  12 Balance:  -925.0747742592581 Reward:  -925.0747742592581 Loss:  724.3638043666724
Training Balance:  -144.39132777777783


100%|█████████▉| 314625/315000 [16:02<00:33, 11.16it/s] 

Validation Balance:  -813.3637137037038
Balance too low, stopping training


100%|██████████| 315000/315000 [16:03<00:00, 327.05it/s]



Iteration: 18, Gamma: 0.9664, Reward Shaping Factor: 0.7986, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104133/315000 [05:20<11:13, 313.16it/s]

Episode  4 Balance:  -1893.430453209884 Reward:  -1893.430453209884 Loss:  1801.077041035518
Training Balance:  -154.80303765432106


 33%|███▎      | 104228/315000 [05:34<4:01:16, 14.56it/s]

Validation Balance:  -741.3848861728395
Balance too low, stopping training


 66%|██████▋   | 209346/315000 [10:40<04:53, 360.02it/s] 

Episode  8 Balance:  -1034.7137370370365 Reward:  -1034.7137370370365 Loss:  716.6279934062622
Training Balance:  -200.5784443827162


 66%|██████▋   | 209408/315000 [10:55<2:36:07, 11.27it/s]

Validation Balance:  -721.7455557407409
Balance too low, stopping training


100%|█████████▉| 314562/315000 [15:58<00:01, 358.31it/s] 

Episode  12 Balance:  -1001.1919835802425 Reward:  -1001.1919835802425 Loss:  954.4246826125309
Training Balance:  -107.62140382716053


100%|█████████▉| 314630/315000 [16:12<00:32, 11.44it/s] 

Validation Balance:  -799.9721844444446
Balance too low, stopping training


100%|██████████| 315000/315000 [16:13<00:00, 323.54it/s]



Iteration: 19, Gamma: 0.9322, Reward Shaping Factor: 0.8131, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104152/315000 [05:19<11:18, 310.65it/s]

Episode  4 Balance:  -2157.8662911728406 Reward:  -2157.8662911728406 Loss:  1482.9891218743287
Training Balance:  -432.535610617284


 33%|███▎      | 104215/315000 [05:33<5:32:14, 10.57it/s]

Validation Balance:  -696.5518937037039
Balance too low, stopping training


 66%|██████▋   | 209369/315000 [10:34<05:06, 344.72it/s] 

Episode  8 Balance:  -1048.388896543209 Reward:  -1048.388896543209 Loss:  546.1350230349926
Training Balance:  -168.21418166666672


 66%|██████▋   | 209436/315000 [10:49<2:34:38, 11.38it/s]

Validation Balance:  -785.3247586419754
Balance too low, stopping training


100%|█████████▉| 314553/315000 [15:49<00:01, 364.62it/s] 

Episode  12 Balance:  -1039.8929302469128 Reward:  -1039.8929302469128 Loss:  575.5095169339329
Training Balance:  -109.00699006172846


100%|█████████▉| 314622/315000 [16:04<00:32, 11.58it/s] 

Validation Balance:  -775.1429052469138
Balance too low, stopping training


100%|██████████| 315000/315000 [16:05<00:00, 326.32it/s]



Iteration: 20, Gamma: 0.9466, Reward Shaping Factor: 0.6592, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104161/315000 [05:21<11:35, 303.25it/s]

Episode  4 Balance:  -1960.335702283953 Reward:  -1960.335702283953 Loss:  1034.3702676682733
Training Balance:  -150.57553549382726


 33%|███▎      | 104196/315000 [05:35<6:39:36,  8.79it/s] 

Validation Balance:  -651.1534661111112
Balance too low, stopping training


 66%|██████▋   | 209362/315000 [10:37<04:55, 356.99it/s] 

Episode  8 Balance:  -865.0958785185182 Reward:  -865.0958785185182 Loss:  420.10824450745713
Training Balance:  -16.93277370370378


 66%|██████▋   | 209429/315000 [10:51<2:36:16, 11.26it/s]

Validation Balance:  -601.5062902469136


100%|█████████▉| 314556/315000 [15:58<00:01, 357.16it/s] 

Episode  12 Balance:  -763.0660424074069 Reward:  -763.0660424074069 Loss:  420.1474975734018
Training Balance:  138.9454691975308


100%|█████████▉| 314623/315000 [16:12<00:33, 11.10it/s] 

Validation Balance:  -611.1176314197531


100%|██████████| 315000/315000 [16:13<00:00, 323.43it/s]



Iteration: 21, Gamma: 0.9319, Reward Shaping Factor: 0.5012, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104147/315000 [05:20<11:18, 310.94it/s]

Episode  4 Balance:  -2165.9355172839455 Reward:  -2165.9355172839455 Loss:  1156.2772632399574
Training Balance:  -362.36471666666677


 33%|███▎      | 104211/315000 [05:35<5:39:22, 10.35it/s]

Validation Balance:  -656.4825155555557
Balance too low, stopping training


 66%|██████▋   | 209368/315000 [10:37<05:24, 325.10it/s] 

Episode  8 Balance:  -985.025917222221 Reward:  -985.025917222221 Loss:  431.437026934349
Training Balance:  -40.97147179012351


 66%|██████▋   | 209408/315000 [10:53<3:30:23,  8.36it/s]

Validation Balance:  -671.4843513580248
Balance too low, stopping training


100%|█████████▉| 314556/315000 [15:54<00:01, 351.82it/s] 

Episode  12 Balance:  -1123.576044320987 Reward:  -1123.576044320987 Loss:  452.54061075893696
Training Balance:  -194.4647348765433


100%|█████████▉| 314623/315000 [16:08<00:33, 11.15it/s] 

Validation Balance:  -775.3221300000001
Balance too low, stopping training


100%|██████████| 315000/315000 [16:09<00:00, 324.86it/s]



Iteration: 22, Gamma: 0.9398, Reward Shaping Factor: 0.8062, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104154/315000 [05:24<11:17, 311.42it/s]

Episode  4 Balance:  -2025.9184807407478 Reward:  -2025.9184807407478 Loss:  1442.4737114049494
Training Balance:  -186.5221035185186


 33%|███▎      | 104196/315000 [05:38<6:20:19,  9.24it/s]

Validation Balance:  -746.2166548148148
Balance too low, stopping training


 66%|██████▋   | 209352/315000 [10:42<05:11, 339.27it/s] 

Episode  8 Balance:  -1039.4635887037032 Reward:  -1039.4635887037032 Loss:  643.4371407316066
Training Balance:  25.7576527777777


 66%|██████▋   | 209418/315000 [10:57<2:40:02, 11.00it/s]

Validation Balance:  -777.7745649382717
Balance too low, stopping training


100%|█████████▉| 314568/315000 [15:59<00:01, 358.96it/s] 

Episode  12 Balance:  -923.9554185802468 Reward:  -923.9554185802468 Loss:  634.2824503125157
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Balance:  -83.3449947530865


100%|█████████▉| 314629/315000 [16:13<00:34, 10.77it/s] 

Validation Balance:  -805.1295413580248
Balance too low, stopping training


100%|██████████| 315000/315000 [16:15<00:00, 323.06it/s]



Iteration: 23, Gamma: 0.9844, Reward Shaping Factor: 0.8926, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104148/315000 [05:19<11:12, 313.57it/s]

Episode  4 Balance:  -1698.285555185185 Reward:  -1698.285555185185 Loss:  1532.903451657854
Training Balance:  -504.16401209876545


 33%|███▎      | 104196/315000 [05:33<6:05:11,  9.62it/s]

Validation Balance:  -632.510332654321


 66%|██████▋   | 209353/315000 [10:35<04:53, 360.44it/s] 

Episode  8 Balance:  -970.432828888889 Reward:  -970.432828888889 Loss:  722.8975314230192
Training Balance:  -217.54310395061736


 66%|██████▋   | 209421/315000 [10:50<2:32:49, 11.51it/s]

Validation Balance:  -393.7420494444445


 92%|█████████▏| 288344/315000 [14:35<01:15, 352.31it/s] 

Epoch 00011: reducing learning rate of group 0 to 2.5000e-05.


100%|█████████▉| 314579/315000 [15:49<00:01, 365.24it/s]

Episode  12 Balance:  -916.2119856790117 Reward:  -916.2119856790117 Loss:  940.8143347529694
Training Balance:  -239.9997653086421


100%|█████████▉| 314620/315000 [16:03<00:39,  9.68it/s] 

Validation Balance:  -404.2517625925927


100%|██████████| 315000/315000 [16:05<00:00, 326.39it/s]



Iteration: 24, Gamma: 0.9416, Reward Shaping Factor: 0.5495, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104161/315000 [05:27<11:09, 314.94it/s]

Episode  4 Balance:  -2169.361002283956 Reward:  -2169.361002283956 Loss:  1275.6688437927514
Training Balance:  -438.98290493827176


 33%|███▎      | 104224/315000 [05:41<5:33:39, 10.53it/s]

Validation Balance:  -691.508708888889
Balance too low, stopping training


 66%|██████▋   | 209356/315000 [10:45<04:53, 359.63it/s] 

Episode  8 Balance:  -1078.3860867901237 Reward:  -1078.3860867901237 Loss:  448.3529979661107
Training Balance:  -117.40424481481494


 66%|██████▋   | 209408/315000 [10:59<2:51:57, 10.23it/s]

Validation Balance:  -753.8255810493829
Balance too low, stopping training


100%|█████████▉| 314558/315000 [16:02<00:01, 355.33it/s] 

Episode  12 Balance:  -995.977195987653 Reward:  -995.977195987653 Loss:  457.78047191526275
Training Balance:  -91.28056913580254


100%|█████████▉| 314626/315000 [16:17<00:32, 11.43it/s] 

Validation Balance:  -724.1958883333334
Balance too low, stopping training


100%|██████████| 315000/315000 [16:18<00:00, 321.94it/s]



Iteration: 25, Gamma: 0.9443, Reward Shaping Factor: 0.6831, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104144/315000 [05:21<11:17, 311.23it/s]

Episode  4 Balance:  -2309.3691438888814 Reward:  -2309.3691438888814 Loss:  1395.078985943459
Training Balance:  -386.1258550000001


 33%|███▎      | 104208/315000 [05:35<5:32:05, 10.58it/s]

Validation Balance:  -757.3129411111112
Balance too low, stopping training


 66%|██████▋   | 209355/315000 [10:39<04:55, 357.95it/s] 

Episode  8 Balance:  -881.3702232716037 Reward:  -881.3702232716037 Loss:  548.3996617479715
Training Balance:  -91.96495814814821


 66%|██████▋   | 209422/315000 [10:54<2:33:56, 11.43it/s]

Validation Balance:  -751.6691904320988
Balance too low, stopping training


100%|█████████▉| 314572/315000 [16:02<00:01, 326.31it/s] 

Episode  12 Balance:  -983.6744236419754 Reward:  -983.6744236419754 Loss:  583.8833032189868
Training Balance:  -94.53131401234572


100%|█████████▉| 314636/315000 [16:16<00:34, 10.68it/s] 

Validation Balance:  -838.4320689506175
Balance too low, stopping training


100%|██████████| 315000/315000 [16:17<00:00, 322.21it/s]



Iteration: 26, Gamma: 0.9633, Reward Shaping Factor: 0.519, Peak and Valley: False
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 33%|███▎      | 104161/315000 [05:20<11:16, 311.80it/s]

Episode  4 Balance:  -1911.6058637037117 Reward:  -1911.6058637037117 Loss:  1013.4676247104071
Training Balance:  -82.465727962963


 33%|███▎      | 104224/315000 [05:34<5:30:41, 10.62it/s]

Validation Balance:  -670.6721755555556
Balance too low, stopping training


 66%|██████▋   | 209344/315000 [10:38<05:35, 314.78it/s] 

Episode  8 Balance:  -743.4641864814821 Reward:  -743.4641864814821 Loss:  474.6999328506645
Training Balance:  92.13360975308633


 66%|██████▋   | 209407/315000 [10:53<2:55:47, 10.01it/s]

Validation Balance:  -548.2838251234568


100%|█████████▉| 314574/315000 [16:02<00:01, 309.19it/s] 

Episode  12 Balance:  -704.3415704320968 Reward:  -704.3415704320968 Loss:  512.1299538891762
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Balance:  153.59102290123448


100%|█████████▉| 314611/315000 [16:27<01:18,  4.93it/s] 

Validation Balance:  -621.8566100617285


100%|██████████| 315000/315000 [16:28<00:00, 318.53it/s]



Iteration: 27, Gamma: 0.9693, Reward Shaping Factor: 0.7334, Peak and Valley: True
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003
Number of engineered features:  57
State dimension:  113
Number of DQN Parameters:  48003


 26%|██▋       | 82791/315000 [05:22<15:03, 256.89it/s]


KeyboardInterrupt: 