Connected to PRL (Python 3.11.0)

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from matplotlib import cm
import pandas as pd
from tqdm import tqdm
import random
from utils import RuleEvaluation, DDQNEvaluation, Plotter
from agent import DDQNAgent, TemporalDDQNAgent
from TestEnv import Electric_Car
seed = 2705
TRAIN = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load Data

def elongate(df):
    df_long = pd.wide_to_long(df, i = "PRICES", j = "hour", stubnames=["Hour"], sep = " ").reset_index()
    df_long.rename(columns={"Hour": "price", "PRICES": "date"}, inplace = True)
    df_long['datetime'] = pd.to_datetime(df_long['date']) + pd.to_timedelta(df_long['hour'], unit='h')
    df_long.sort_values(['datetime'], ascending=[True], inplace=True)
    df_long['price'] = df_long['price'].astype(float) 
    return df_long.reset_index(drop=True)

train_name = 'data/train.xlsx'
val_name = 'data/validate.xlsx'
train = elongate(pd.read_excel(train_name))
val = elongate(pd.read_excel(val_name))
features_train = pd.read_csv('data/features_train.csv')
features_val = pd.read_csv('data/features_val.csv')
#%%
# Define the intervals for gamma and reward shaping factor

gamma_interval = [0.93, 0.99]
reward_shaping_interval = [0.2, 0.5]
battery_factor_interval = [0.0, 0.15]
# Define the number of iterations for the random search

num_iterations = 100
for i in range(num_iterations):
    # Generate random values within the intervals
    gamma = np.random.uniform(*gamma_interval)
    reward_shaping_factor = np.random.uniform(*reward_shaping_interval)
    battery_factor = np.random.uniform(*battery_factor_interval)
    print(f'\nIteration: {i+1}, Gamma: {gamma}, Reward Shaping Factor: {reward_shaping_factor}, Battery Factor: {battery_factor}')
    seed = 2705
    rep = 420000
    batch_size = 48
    gamma = gamma
    epsilon = 1.0
    epsilon_decay = 99999
    epsilon_min = 0.1
    learning_rate = 5e-5
    price_horizon = 48
    future_horizon = 0
    hidden_dim = 128
    num_layers = 4
    positions = False
    action_classes = 3
    reward_shaping = True
    factor = reward_shaping_factor
    verbose = False
    normalize = True
    df = train_name
    # Initialize Environment
    env = Electric_Car(path_to_test_data=df)
    val_env = Electric_Car(path_to_test_data=val_name)
    #Initialize DQN
    agent = DDQNAgent(env = env,
                    features = features_train,
                    epsilon_decay = epsilon_decay,
                    epsilon_start = epsilon,
                    epsilon_end = epsilon_min,
                    discount_rate = gamma,
                    lr = learning_rate,
                    buffer_size = 100000,
                    price_horizon = price_horizon,
                    hidden_dim=hidden_dim,
                    num_layers = num_layers,
                    positions = positions,
                    action_classes = action_classes, 
                    reward_shaping = reward_shaping,
                    shaping_factor = factor,
                    normalize = normalize,
                    verbose = verbose)
    val_agent = DDQNAgent(env = val_env,
                    features = features_val,
                    epsilon_decay = epsilon_decay,
                    epsilon_start = epsilon,
                    epsilon_end = epsilon_min,
                    discount_rate = gamma,
                    lr = learning_rate,
                    buffer_size = 100000,
                    price_horizon = price_horizon,
                    hidden_dim=hidden_dim,
                    num_layers = num_layers,
                    positions = positions,
                    action_classes = action_classes, 
                    reward_shaping = reward_shaping,
                    shaping_factor = factor,
                    normalize = normalize,
                    verbose = verbose)
    episode_balance = 0
    episode_loss = 0
    episode_counter = 0
    episode_reward = 0
    obs, r, t, _, _ = env.step(random.randint(-1,1)) # Reset environment and get initial observation
    state, grads = agent.obs_to_state(obs)
    for i in tqdm(range(rep)):
        action = agent.choose_action(i, state, greedy = False) # Choose action (discrete)
        cont_action = agent.action_to_cont(action) # Convert to continuous action
        new_obs, r, t, _, _ = env.step(cont_action)
        new_state, new_grads = agent.obs_to_state(new_obs)
        # Reward Shaping            
        new_reward = agent.shape_reward(r, cont_action, grads, battery_factor = battery_factor)
        # Fill replay buffer - THIS IS THE ONLY THING WE DO WITH THE CURRENT OBSERVATION - LEARNING IS FULLY PERFORMED FROM THE REPLAY BUFFER
        if state.shape[0] == agent.state_dim and new_state.shape[0] == agent.state_dim:
            agent.replay_memory.add_data((state, action, new_reward, t, new_state))
        #Update DQN
        loss = agent.optimize(batch_size)
        # Update values
        episode_balance += r
        episode_reward += r
        episode_loss += loss
        # New observation
        state = new_state
        grads = new_grads # Gradients for reward shaping
        if t:
            # Reset Environment
            env.counter = 0
            env.hour = 1
            env.day = 1
            episode_counter += 1
            if episode_counter % 4 == 0:
                print('Episode ', episode_counter, 'Balance: ', episode_balance, 'Reward: ', episode_reward, 'Loss: ', episode_loss) # Add both balance and reward to see how training objective and actually spent money differ
            # Scheduler Step
            agent.scheduler.step(episode_loss)
            episode_loss = 0
            episode_balance = 0
            episode_reward = 0
            if episode_counter % 4 == 0:
                # Evaluate DQN
                print("Training Evaluation")
                train_dqn = DDQNEvaluation(price_horizon = price_horizon)
                train_dqn.evaluate(agent = agent)
                # Evaluate DQN
                print("Validation Evaluation")
                val_agent.dqn_predict.load_state_dict(agent.dqn_predict.state_dict())
                val_dqn = DDQNEvaluation(price_horizon = price_horizon)
                val_dqn.evaluate(agent = val_agent)
                # Reset Environment
                env.counter = 0
                env.hour = 1
                env.day = 1
                val_env.counter = 0
                val_env.hour = 1
                val_env.day = 1
    # Save agent
    torch.save(agent.dqn_predict.state_dict(), f'models/random_search_gamma{gamma}_factor_{factor}.pt')


Iteration: 1, Gamma: 0.9386139862524565, Reward Shaping Factor: 0.41420794369762703, Battery Factor: 0.06555965628369004


 25%|██▍       | 104155/420000 [06:31<21:58, 239.61it/s]

Episode  4 Balance:  -1886.3773093209911 Reward:  -1886.3773093209911 Loss:  553.1817215082701
Training Evaluation
Absolute Balance:  -164.01173197530866
Validation Evaluation


 25%|██▍       | 104191/420000 [06:56<19:59:09,  4.39it/s]

Absolute Balance:  -477.6077298765432


 50%|████▉     | 209360/420000 [13:19<12:32, 279.79it/s]  

Episode  8 Balance:  -936.6781800617249 Reward:  -936.6781800617249 Loss:  536.5711674456252
Training Evaluation


 50%|████▉     | 209360/420000 [13:30<12:32, 279.79it/s]

Absolute Balance:  -139.6463543827161
Validation Evaluation


 50%|████▉     | 209401/420000 [13:39<9:53:34,  5.91it/s] 

Absolute Balance:  -467.03703666666667


 75%|███████▍  | 314556/420000 [20:15<06:30, 269.76it/s] 

Episode  12 Balance:  -942.7328776543201 Reward:  -942.7328776543201 Loss:  476.23348377551883
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Evaluation
Absolute Balance:  -108.996668888889
Validation Evaluation


 75%|███████▍  | 314604/420000 [20:36<4:59:26,  5.87it/s]

Absolute Balance:  -497.8519831481482


100%|█████████▉| 419782/420000 [26:34<00:00, 317.72it/s] 

Episode  16 Balance:  -1020.3785594444439 Reward:  -1020.3785594444439 Loss:  465.9350751241436
Training Evaluation
Absolute Balance:  -15.925747345679095
Validation Evaluation


100%|█████████▉| 419860/420000 [26:55<00:14,  9.34it/s] 

Absolute Balance:  -409.3163317283951


100%|██████████| 420000/420000 [26:55<00:00, 259.95it/s]



Iteration: 2, Gamma: 0.9393952133978245, Reward Shaping Factor: 0.24217310583735058, Battery Factor: 0.08046569705950479


 25%|██▍       | 104162/420000 [05:27<17:01, 309.21it/s]

Episode  4 Balance:  -1918.8715816666745 Reward:  -1918.8715816666745 Loss:  582.9327457012841
Training Evaluation
Absolute Balance:  -271.3671622839507
Validation Evaluation


 25%|██▍       | 104195/420000 [05:47<14:27:35,  6.07it/s]

Absolute Balance:  -529.6453809259259


 50%|████▉     | 209371/420000 [11:44<11:19, 309.85it/s]  

Episode  8 Balance:  -1034.747459506173 Reward:  -1034.747459506173 Loss:  455.6692752710078
Training Evaluation
Absolute Balance:  -189.5644261111112
Validation Evaluation


 50%|████▉     | 209402/420000 [12:04<10:36:25,  5.52it/s]

Absolute Balance:  -536.6145829012346


 75%|███████▍  | 314577/420000 [18:12<05:24, 324.62it/s]  

Episode  12 Balance:  -1112.201430308641 Reward:  -1112.201430308641 Loss:  455.19885764003266
Training Evaluation


 75%|███████▍  | 314577/420000 [18:23<05:24, 324.62it/s]

Absolute Balance:  -11.517481975308705
Validation Evaluation


 75%|███████▍  | 314619/420000 [18:35<5:13:58,  5.59it/s]

Absolute Balance:  -524.4978338888889


 81%|████████  | 340942/420000 [20:06<04:12, 312.53it/s] 

Epoch 00013: reducing learning rate of group 0 to 2.5000e-05.


100%|█████████▉| 419789/420000 [24:34<00:00, 281.25it/s]

Episode  16 Balance:  -1122.6142546913609 Reward:  -1122.6142546913609 Loss:  484.2536066310713
Training Evaluation
Absolute Balance:  -40.98495685185193
Validation Evaluation


100%|█████████▉| 419830/420000 [24:57<00:31,  5.46it/s] 

Absolute Balance:  -452.2569154938272


100%|██████████| 420000/420000 [24:58<00:00, 280.34it/s]



Iteration: 3, Gamma: 0.9836362659039988, Reward Shaping Factor: 0.4042027710942566, Battery Factor: 0.0961414067876096


 25%|██▍       | 104147/420000 [05:28<15:54, 330.74it/s]

Episode  4 Balance:  -1539.7049116049436 Reward:  -1539.7049116049436 Loss:  730.9029978075996
Training Evaluation
Absolute Balance:  -264.4200103703705
Validation Evaluation


 25%|██▍       | 104195/420000 [05:50<13:43:34,  6.39it/s]

Absolute Balance:  -502.57284320987657


 50%|████▉     | 209363/420000 [11:50<11:12, 313.42it/s]  

Episode  8 Balance:  -958.8064745061741 Reward:  -958.8064745061741 Loss:  724.2234262577258
Training Evaluation
Absolute Balance:  15.402747530864035
Validation Evaluation


 50%|████▉     | 209390/420000 [12:11<11:04:39,  5.28it/s]

Absolute Balance:  -450.95757117283955


 75%|███████▍  | 314563/420000 [18:10<05:43, 307.02it/s]  

Episode  12 Balance:  -1056.1988593827182 Reward:  -1056.1988593827182 Loss:  700.4617272410542
Training Evaluation
Absolute Balance:  -147.98258339506188
Validation Evaluation


 75%|███████▍  | 314620/420000 [18:31<4:14:20,  6.91it/s]

Absolute Balance:  -627.0262198765433


100%|█████████▉| 419781/420000 [24:27<00:00, 311.94it/s] 

Episode  16 Balance:  -1000.9851311111106 Reward:  -1000.9851311111106 Loss:  780.4263022902887
Training Evaluation
Absolute Balance:  59.180663271604786
Validation Evaluation


100%|█████████▉| 419823/420000 [24:48<00:29,  6.04it/s] 

Absolute Balance:  -527.7488097530866


100%|██████████| 420000/420000 [24:48<00:00, 282.07it/s]



Iteration: 4, Gamma: 0.9891596230254911, Reward Shaping Factor: 0.42305705815593553, Battery Factor: 0.055753546127395096


 25%|██▍       | 104138/420000 [05:28<16:34, 317.67it/s]

Episode  4 Balance:  -1654.810938580248 Reward:  -1654.810938580248 Loss:  785.453323637601
Training Evaluation
Absolute Balance:  -196.34728771604946
Validation Evaluation


 25%|██▍       | 104193/420000 [05:48<12:25:34,  7.06it/s]

Absolute Balance:  -564.8201830246915


 50%|████▉     | 209368/420000 [11:49<10:21, 338.98it/s]  

Episode  8 Balance:  -1009.7642270370364 Reward:  -1009.7642270370364 Loss:  755.9045505505055
Training Evaluation
Absolute Balance:  -158.02998864197548
Validation Evaluation


 50%|████▉     | 209407/420000 [12:10<9:16:01,  6.31it/s] 

Absolute Balance:  -499.91795197530877


 75%|███████▍  | 314574/420000 [18:11<05:57, 294.67it/s] 

Episode  12 Balance:  -887.728213271605 Reward:  -887.728213271605 Loss:  820.78743630182
Training Evaluation
Absolute Balance:  -146.33420722222235
Validation Evaluation


 75%|███████▍  | 314619/420000 [18:31<4:43:17,  6.20it/s]

Absolute Balance:  -477.8192091975309


 81%|████████  | 340945/420000 [19:59<04:10, 315.36it/s] 

Epoch 00013: reducing learning rate of group 0 to 2.5000e-05.


100%|█████████▉| 419795/420000 [24:31<00:00, 296.23it/s]

Episode  16 Balance:  -938.4440624074064 Reward:  -938.4440624074064 Loss:  782.269781507086
Training Evaluation
Absolute Balance:  23.79001037037027
Validation Evaluation


100%|█████████▉| 419832/420000 [24:52<00:28,  5.84it/s] 

Absolute Balance:  -472.55079913580244


100%|██████████| 420000/420000 [24:52<00:00, 281.32it/s]



Iteration: 5, Gamma: 0.9540164566680324, Reward Shaping Factor: 0.43669508481611485, Battery Factor: 0.03798925120802284


 25%|██▍       | 104150/420000 [05:28<17:06, 307.79it/s]

Episode  4 Balance:  -1869.1807050617365 Reward:  -1869.1807050617365 Loss:  647.6055248396005
Training Evaluation


 25%|██▍       | 104150/420000 [05:41<17:06, 307.79it/s]

Absolute Balance:  -196.0284206790124
Validation Evaluation


 25%|██▍       | 104192/420000 [05:49<14:19:52,  6.12it/s]

Absolute Balance:  -419.7539215432099


 50%|████▉     | 209360/420000 [11:47<11:47, 297.75it/s]  

Episode  8 Balance:  -979.8139061728398 Reward:  -979.8139061728398 Loss:  519.3121371162124
Training Evaluation
Absolute Balance:  47.40416679012336
Validation Evaluation


 50%|████▉     | 209403/420000 [12:08<9:35:49,  6.10it/s] 

Absolute Balance:  -476.8632877160494


 75%|███████▍  | 314571/420000 [18:10<14:52, 118.08it/s] 

Episode  12 Balance:  -967.8139279629631 Reward:  -967.8139279629631 Loss:  550.1169648868963
Training Evaluation


 75%|███████▍  | 314571/420000 [18:21<14:52, 118.08it/s]

Absolute Balance:  24.029086790123316
Validation Evaluation


 75%|███████▍  | 314619/420000 [18:32<5:47:29,  5.05it/s]

Absolute Balance:  -449.84096037037045


100%|█████████▉| 419768/420000 [24:32<00:00, 308.00it/s] 

Episode  16 Balance:  -896.8537966049377 Reward:  -896.8537966049377 Loss:  499.912423727219
Training Evaluation
Absolute Balance:  -39.256336172839596
Validation Evaluation


100%|█████████▉| 419830/420000 [24:53<00:23,  7.18it/s] 

Absolute Balance:  -492.458477962963


100%|██████████| 420000/420000 [24:54<00:00, 281.10it/s]



Iteration: 6, Gamma: 0.9397869588757166, Reward Shaping Factor: 0.47129661902019, Battery Factor: 0.12545766454245721


 25%|██▍       | 104135/420000 [05:36<16:46, 313.76it/s] 

Episode  4 Balance:  -1700.4782039506213 Reward:  -1700.4782039506213 Loss:  551.0097419394879
Training Evaluation


 25%|██▍       | 104135/420000 [05:46<16:46, 313.76it/s]

Absolute Balance:  -174.79348253086428
Validation Evaluation


 25%|██▍       | 104191/420000 [05:56<12:32:21,  7.00it/s]

Absolute Balance:  -459.0991378395062


 50%|████▉     | 209351/420000 [12:22<12:35, 278.66it/s]  

Episode  8 Balance:  -871.1629851851839 Reward:  -871.1629851851839 Loss:  552.8306986261159
Training Evaluation
Absolute Balance:  -220.58577617283964
Validation Evaluation


 50%|████▉     | 209404/420000 [12:45<10:01:46,  5.83it/s]

Absolute Balance:  -573.0473780864198


 75%|███████▍  | 314561/420000 [18:45<06:15, 281.03it/s]  

Episode  12 Balance:  -1027.5484275925924 Reward:  -1027.5484275925924 Loss:  489.1501129323151
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Evaluation


 75%|███████▍  | 314561/420000 [18:57<06:15, 281.03it/s]

Absolute Balance:  -153.50716055555563
Validation Evaluation


 75%|███████▍  | 314615/420000 [19:06<4:40:59,  6.25it/s]

Absolute Balance:  -514.7737656172841


100%|█████████▉| 419792/420000 [25:06<00:00, 316.50it/s] 

Episode  16 Balance:  -983.1880074691348 Reward:  -983.1880074691348 Loss:  461.72017382015474
Training Evaluation


100%|█████████▉| 419792/420000 [25:17<00:00, 316.50it/s]

Absolute Balance:  -197.66685111111124
Validation Evaluation


100%|█████████▉| 419824/420000 [25:27<00:31,  5.55it/s] 

Absolute Balance:  -518.4334297530864


100%|██████████| 420000/420000 [25:27<00:00, 274.89it/s]



Iteration: 7, Gamma: 0.9594280160181485, Reward Shaping Factor: 0.41458204630349843, Battery Factor: 0.11668304461372409


 25%|██▍       | 104151/420000 [05:27<14:33, 361.57it/s]

Episode  4 Balance:  -1605.5056812345708 Reward:  -1605.5056812345708 Loss:  666.4626581415068
Training Evaluation


 25%|██▍       | 104151/420000 [05:38<14:33, 361.57it/s]

Absolute Balance:  -154.4121459876544
Validation Evaluation


 25%|██▍       | 104195/420000 [05:45<11:18:51,  7.75it/s]

Absolute Balance:  -507.00153500000005


 50%|████▉     | 209344/420000 [10:58<09:54, 354.16it/s]  

Episode  8 Balance:  -914.8080146296257 Reward:  -914.8080146296257 Loss:  573.6985817619134
Training Evaluation
Absolute Balance:  -38.57708074074081
Validation Evaluation


 50%|████▉     | 209416/420000 [11:17<6:28:11,  9.04it/s]

Absolute Balance:  -424.6009267901235


 75%|███████▍  | 314551/420000 [16:28<04:56, 356.08it/s] 

Episode  12 Balance:  -842.2547608641971 Reward:  -842.2547608641971 Loss:  554.4357614747714
Training Evaluation


 75%|███████▍  | 314551/420000 [16:38<04:56, 356.08it/s]

Absolute Balance:  -20.136055061728452
Validation Evaluation


 75%|███████▍  | 314619/420000 [16:46<3:20:42,  8.75it/s]

Absolute Balance:  -467.1683917283951


100%|█████████▉| 419779/420000 [21:55<00:00, 359.79it/s] 

Episode  16 Balance:  -896.5162878395045 Reward:  -896.5162878395045 Loss:  577.9526414144784
Training Evaluation
Absolute Balance:  -129.1805208641977
Validation Evaluation


100%|█████████▉| 419831/420000 [22:14<00:21,  7.83it/s] 

Absolute Balance:  -479.92626827160495


100%|██████████| 420000/420000 [22:15<00:00, 314.58it/s]



Iteration: 8, Gamma: 0.9597915474481638, Reward Shaping Factor: 0.4215170297735945, Battery Factor: 0.08117473951214306


 25%|██▍       | 104146/420000 [04:41<15:24, 341.66it/s]

Episode  4 Balance:  -1721.123782098766 Reward:  -1721.123782098766 Loss:  629.2063418303151
Training Evaluation


 25%|██▍       | 104146/420000 [04:52<15:24, 341.66it/s]

Absolute Balance:  -224.64327351851858
Validation Evaluation


 25%|██▍       | 104194/420000 [05:01<12:11:00,  7.20it/s]

Absolute Balance:  -514.0583174691359


 50%|████▉     | 209365/420000 [10:12<10:47, 325.55it/s]  

Episode  8 Balance:  -945.500704629626 Reward:  -945.500704629626 Loss:  611.830407030182
Training Evaluation


 50%|████▉     | 209365/420000 [10:22<10:47, 325.55it/s]

Absolute Balance:  -92.48272320987664
Validation Evaluation


 50%|████▉     | 209407/420000 [10:32<8:55:20,  6.56it/s] 

Absolute Balance:  -507.5344248148149


 75%|███████▍  | 314567/420000 [15:38<04:55, 356.59it/s] 

Episode  12 Balance:  -932.9816803086433 Reward:  -932.9816803086433 Loss:  524.5972525869729
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Evaluation
Absolute Balance:  34.656637716049325
Validation Evaluation


 75%|███████▍  | 314618/420000 [16:00<4:18:56,  6.78it/s]

Absolute Balance:  -427.37245296296294


100%|█████████▉| 419763/420000 [21:06<00:00, 356.20it/s] 

Episode  16 Balance:  -883.8993040740714 Reward:  -883.8993040740714 Loss:  531.5312187125674
Training Evaluation
Absolute Balance:  -77.01221018518528
Validation Evaluation


100%|█████████▉| 419829/420000 [21:25<00:19,  8.66it/s] 

Absolute Balance:  -483.5809527160494


100%|██████████| 420000/420000 [21:25<00:00, 326.64it/s]



Iteration: 9, Gamma: 0.950981342264074, Reward Shaping Factor: 0.4550778920841893, Battery Factor: 0.026989355263038955


 25%|██▍       | 104144/420000 [04:47<14:54, 353.23it/s]

Episode  4 Balance:  -1820.4529061728438 Reward:  -1820.4529061728438 Loss:  604.3911272750702
Training Evaluation
Absolute Balance:  -95.98457882716056
Validation Evaluation


 25%|██▍       | 104194/420000 [05:06<11:16:34,  7.78it/s]

Absolute Balance:  -366.02467000000007


 50%|████▉     | 209352/420000 [10:18<10:04, 348.28it/s]  

Episode  8 Balance:  -1028.5229887654305 Reward:  -1028.5229887654305 Loss:  572.1823608280392
Training Evaluation
Absolute Balance:  -72.48323339506179
Validation Evaluation


 50%|████▉     | 209408/420000 [10:37<7:14:50,  8.07it/s] 

Absolute Balance:  -463.17480950617295


 75%|███████▍  | 314572/420000 [15:45<04:53, 359.31it/s] 

Episode  12 Balance:  -1065.0676274691352 Reward:  -1065.0676274691352 Loss:  503.98005747713614
Training Evaluation


 75%|███████▍  | 314572/420000 [15:55<04:53, 359.31it/s]

Absolute Balance:  -81.83003623456794
Validation Evaluation


 75%|███████▍  | 314619/420000 [16:04<3:54:48,  7.48it/s]

Absolute Balance:  -527.4236822222223


100%|█████████▉| 419791/420000 [21:15<00:00, 356.43it/s] 

Episode  16 Balance:  -907.7087941358025 Reward:  -907.7087941358025 Loss:  537.174583009677
Training Evaluation


100%|█████████▉| 419791/420000 [21:26<00:00, 356.43it/s]

Absolute Balance:  -29.48854376543216
Validation Evaluation


100%|█████████▉| 419828/420000 [21:34<00:24,  7.06it/s] 

Absolute Balance:  -455.5592320370371


100%|██████████| 420000/420000 [21:35<00:00, 324.30it/s]



Iteration: 10, Gamma: 0.975706211693285, Reward Shaping Factor: 0.31901847580040754, Battery Factor: 0.1004644156240743


 25%|██▍       | 104138/420000 [04:42<18:27, 285.31it/s]

Episode  4 Balance:  -1714.7214590740784 Reward:  -1714.7214590740784 Loss:  679.4190906295553
Training Evaluation
Absolute Balance:  -117.42850956790132
Validation Evaluation


 25%|██▍       | 104196/420000 [05:02<12:08:10,  7.23it/s]

Absolute Balance:  -431.17240104938276


 50%|████▉     | 209355/420000 [10:10<10:46, 325.65it/s]  

Episode  8 Balance:  -838.279815555557 Reward:  -838.279815555557 Loss:  626.1844594525173
Training Evaluation
Absolute Balance:  -19.337697777777855
Validation Evaluation


 50%|████▉     | 209406/420000 [10:31<8:26:34,  6.93it/s] 

Absolute Balance:  -525.1711144444445


 75%|███████▍  | 314554/420000 [15:37<05:03, 347.64it/s] 

Episode  12 Balance:  -913.7182615432083 Reward:  -913.7182615432083 Loss:  634.4588743813802
Training Evaluation


 75%|███████▍  | 314554/420000 [15:50<05:03, 347.64it/s]

Absolute Balance:  -8.845470432098836
Validation Evaluation


 75%|███████▍  | 314617/420000 [15:58<3:57:35,  7.39it/s]

Absolute Balance:  -523.5278677160495


 94%|█████████▎| 393555/420000 [19:52<01:14, 354.87it/s] 

Epoch 00015: reducing learning rate of group 0 to 2.5000e-05.


100%|█████████▉| 419794/420000 [21:08<00:00, 349.22it/s]

Episode  16 Balance:  -765.9683901234563 Reward:  -765.9683901234563 Loss:  670.1470293723978
Training Evaluation


100%|█████████▉| 419794/420000 [21:20<00:00, 349.22it/s]

Absolute Balance:  24.555603395061674
Validation Evaluation


100%|█████████▉| 419829/420000 [21:29<00:27,  6.11it/s] 

Absolute Balance:  -471.3528348765433


100%|██████████| 420000/420000 [21:30<00:00, 325.47it/s]



Iteration: 11, Gamma: 0.9873110587278168, Reward Shaping Factor: 0.25431850336132306, Battery Factor: 0.03485932850517898


 25%|██▍       | 104151/420000 [04:40<14:48, 355.47it/s]

Episode  4 Balance:  -1637.9801430864252 Reward:  -1637.9801430864252 Loss:  688.8890369117726
Training Evaluation
Absolute Balance:  -97.30179462962968
Validation Evaluation


 25%|██▍       | 104196/420000 [04:58<11:21:54,  7.72it/s]

Absolute Balance:  -396.148652962963


 50%|████▉     | 209370/420000 [10:06<09:48, 358.10it/s]  

Episode  8 Balance:  -865.4206817901226 Reward:  -865.4206817901226 Loss:  591.0144985464867
Training Evaluation
Absolute Balance:  -91.39145808641982
Validation Evaluation


 50%|████▉     | 209408/420000 [10:25<8:01:57,  7.28it/s] 

Absolute Balance:  -473.9702962962963


 75%|███████▍  | 314584/420000 [15:33<04:55, 357.09it/s] 

Episode  12 Balance:  -816.201438395061 Reward:  -816.201438395061 Loss:  740.655090350192
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Evaluation
Absolute Balance:  -1.295308765432205
Validation Evaluation


 75%|███████▍  | 314614/420000 [15:51<4:23:07,  6.68it/s]

Absolute Balance:  -424.4868844444445


100%|█████████▉| 419778/420000 [21:00<00:01, 204.35it/s] 

Episode  16 Balance:  -926.0441591975336 Reward:  -926.0441591975336 Loss:  794.3004366005771
Training Evaluation
Absolute Balance:  119.52561765432088
Validation Evaluation


100%|█████████▉| 419832/420000 [21:18<00:21,  7.82it/s] 

Absolute Balance:  -381.02949123456796


100%|██████████| 420000/420000 [21:19<00:00, 328.35it/s]



Iteration: 12, Gamma: 0.9749867671626784, Reward Shaping Factor: 0.4047128409208095, Battery Factor: 0.02098065401362173


 25%|██▍       | 104143/420000 [04:42<16:17, 323.28it/s]

Episode  4 Balance:  -1665.8302372222247 Reward:  -1665.8302372222247 Loss:  666.4020686862059
Training Evaluation
Absolute Balance:  -92.42536086419759
Validation Evaluation


 25%|██▍       | 104199/420000 [05:01<11:07:19,  7.89it/s]

Absolute Balance:  -448.7944916666667


 50%|████▉     | 209348/420000 [10:09<11:55, 294.43it/s]  

Episode  8 Balance:  -1004.0359483333327 Reward:  -1004.0359483333327 Loss:  638.353292994434
Training Evaluation
Absolute Balance:  -51.61272524691367
Validation Evaluation


 50%|████▉     | 209408/420000 [10:28<7:25:36,  7.88it/s] 

Absolute Balance:  -506.2005400000001


 75%|███████▍  | 314554/420000 [15:37<05:21, 328.06it/s] 

Episode  12 Balance:  -1026.058992530863 Reward:  -1026.058992530863 Loss:  597.4885432377923
Training Evaluation


 75%|███████▍  | 314554/420000 [15:48<05:21, 328.06it/s]

Absolute Balance:  11.334055308641892
Validation Evaluation


 75%|███████▍  | 314618/420000 [15:56<3:30:10,  8.36it/s]

Absolute Balance:  -446.67272858024694


100%|█████████▉| 419788/420000 [21:04<00:00, 358.27it/s] 

Episode  16 Balance:  -911.9527163580228 Reward:  -911.9527163580228 Loss:  598.388598118443
Training Evaluation
Absolute Balance:  65.09925950617273
Validation Evaluation


100%|█████████▉| 419832/420000 [21:22<00:21,  7.64it/s] 

Absolute Balance:  -486.177495617284


100%|██████████| 420000/420000 [21:23<00:00, 327.29it/s]



Iteration: 13, Gamma: 0.9674909807672836, Reward Shaping Factor: 0.21196870498392598, Battery Factor: 0.08731548974270727


 25%|██▍       | 104134/420000 [04:53<14:38, 359.55it/s]

Episode  4 Balance:  -1728.916680185187 Reward:  -1728.916680185187 Loss:  590.2057777279988
Training Evaluation


 25%|██▍       | 104134/420000 [05:04<14:38, 359.55it/s]

Absolute Balance:  -48.45920537037046
Validation Evaluation


 25%|██▍       | 104194/420000 [05:12<10:28:36,  8.37it/s]

Absolute Balance:  -462.78265981481485


 50%|████▉     | 209351/420000 [10:29<09:51, 356.28it/s]  

Episode  8 Balance:  -916.8411653086414 Reward:  -916.8411653086414 Loss:  573.3759027549531
Training Evaluation
Absolute Balance:  -75.3088117901235
Validation Evaluation


 50%|████▉     | 209407/420000 [10:48<7:12:29,  8.12it/s] 

Absolute Balance:  -554.966493888889


 75%|███████▍  | 314572/420000 [16:06<06:38, 264.86it/s] 

Episode  12 Balance:  -943.4275096296295 Reward:  -943.4275096296295 Loss:  545.8246616406832
Epoch 00012: reducing learning rate of group 0 to 2.5000e-05.
Training Evaluation
Absolute Balance:  -55.14006895061734
Validation Evaluation


 75%|███████▍  | 314620/420000 [16:25<4:10:15,  7.02it/s]

Absolute Balance:  -527.9884044444445


100%|█████████▉| 419793/420000 [21:34<00:00, 351.72it/s] 

Episode  16 Balance:  -916.1870640740738 Reward:  -916.1870640740738 Loss:  554.6238578668563
Training Evaluation


100%|█████████▉| 419793/420000 [21:44<00:00, 351.72it/s]

Absolute Balance:  29.108633395061695
Validation Evaluation


100%|█████████▉| 419828/420000 [21:53<00:24,  6.88it/s] 

Absolute Balance:  -452.3712883333334


100%|██████████| 420000/420000 [21:53<00:00, 319.68it/s]



Iteration: 14, Gamma: 0.9393257120584706, Reward Shaping Factor: 0.28802302318963774, Battery Factor: 0.040215179099265204


 25%|██▍       | 104126/420000 [05:23<14:59, 350.99it/s]

Episode  4 Balance:  -1834.3979216049422 Reward:  -1834.3979216049422 Loss:  565.6771217507776
Training Evaluation
Absolute Balance:  -143.59341808641977
Validation Evaluation


 25%|██▍       | 104192/420000 [05:41<10:08:02,  8.66it/s]

Absolute Balance:  -459.48022222222227


 50%|████▉     | 209357/420000 [10:54<09:52, 355.38it/s]  

Episode  8 Balance:  -1033.3375850617326 Reward:  -1033.3375850617326 Loss:  486.05105637805536
Training Evaluation
Absolute Balance:  12.72355012345673
Validation Evaluation


 50%|████▉     | 209408/420000 [11:13<7:31:35,  7.77it/s] 

Absolute Balance:  -464.242342962963


 69%|██████▊   | 288341/420000 [15:07<06:10, 354.94it/s] 

Epoch 00011: reducing learning rate of group 0 to 2.5000e-05.


 75%|███████▍  | 314557/420000 [16:23<05:18, 331.54it/s]

Episode  12 Balance:  -1051.6724492592605 Reward:  -1051.6724492592605 Loss:  470.5667727353284
Training Evaluation
Absolute Balance:  11.481638395061653
Validation Evaluation


 75%|███████▍  | 314616/420000 [16:42<3:40:17,  7.97it/s]

Absolute Balance:  -441.44203327160506


100%|█████████▉| 419793/420000 [21:52<00:00, 268.86it/s] 

Episode  16 Balance:  -989.1649282098773 Reward:  -989.1649282098773 Loss:  459.9032892086543
Training Evaluation
Absolute Balance:  -59.977853518518636
Validation Evaluation


100%|█████████▉| 419830/420000 [22:12<00:30,  5.57it/s] 

Absolute Balance:  -531.0873305555556


100%|██████████| 420000/420000 [22:13<00:00, 315.00it/s]



Iteration: 15, Gamma: 0.9653462233237597, Reward Shaping Factor: 0.415791835675556, Battery Factor: 0.1131240248074878


 25%|██▍       | 104147/420000 [05:16<20:11, 260.61it/s]

Episode  4 Balance:  -1704.9894856790088 Reward:  -1704.9894856790088 Loss:  639.1402037816588
Training Evaluation
Absolute Balance:  -144.14069290123467
Validation Evaluation


 25%|██▍       | 104191/420000 [05:37<14:11:45,  6.18it/s]

Absolute Balance:  -471.5688355555556


 50%|████▉     | 209353/420000 [12:00<11:23, 308.27it/s]  

Episode  8 Balance:  -874.7338558024705 Reward:  -874.7338558024705 Loss:  567.4356505137403
Training Evaluation
Absolute Balance:  -197.67709339506183
Validation Evaluation


 50%|████▉     | 209407/420000 [12:20<8:34:29,  6.82it/s] 

Absolute Balance:  -432.0914794444445


 75%|███████▍  | 314563/420000 [18:24<05:40, 309.57it/s] 

Episode  12 Balance:  -886.5247262345681 Reward:  -886.5247262345681 Loss:  604.9582977471873
Training Evaluation


 75%|███████▍  | 314563/420000 [18:36<05:40, 309.57it/s]

Absolute Balance:  5.7532189506171925
Validation Evaluation


 75%|███████▍  | 314615/420000 [18:46<4:40:30,  6.26it/s]

Absolute Balance:  -449.74505685185187


 87%|████████▋ | 367246/420000 [21:47<02:49, 311.10it/s] 

Epoch 00014: reducing learning rate of group 0 to 2.5000e-05.


100%|█████████▉| 419769/420000 [24:50<00:00, 316.18it/s]

Episode  16 Balance:  -882.056291666669 Reward:  -882.056291666669 Loss:  556.3672705385834
Training Evaluation
Absolute Balance:  -76.45704691358034
Validation Evaluation


100%|█████████▉| 419829/420000 [25:11<00:23,  7.13it/s] 

Absolute Balance:  -453.7297227777778


100%|██████████| 420000/420000 [25:12<00:00, 277.68it/s]



Iteration: 16, Gamma: 0.9438093703614144, Reward Shaping Factor: 0.376794950067196, Battery Factor: 0.07098797705931267


 25%|██▍       | 104161/420000 [05:35<19:46, 266.12it/s]

Episode  4 Balance:  -1666.694302283957 Reward:  -1666.694302283957 Loss:  643.5423160144128
Training Evaluation
Absolute Balance:  -187.7973947530865
Validation Evaluation


 25%|██▍       | 104222/420000 [05:57<11:47:41,  7.44it/s]

Absolute Balance:  -524.7797330864198


 50%|████▉     | 209352/420000 [12:02<11:10, 314.39it/s]  

Episode  8 Balance:  -904.8809841358024 Reward:  -904.8809841358024 Loss:  508.0704006694723
Training Evaluation
Absolute Balance:  -11.536572160493872
Validation Evaluation


 50%|████▉     | 209403/420000 [12:23<8:57:03,  6.54it/s] 

Absolute Balance:  -505.7634453703704


 75%|███████▍  | 314557/420000 [18:31<05:12, 337.35it/s] 

Episode  12 Balance:  -921.7165715432109 Reward:  -921.7165715432109 Loss:  576.413148976746
Training Evaluation


 75%|███████▍  | 314557/420000 [18:42<05:12, 337.35it/s]

Absolute Balance:  97.27813759259251
Validation Evaluation


 75%|███████▍  | 314613/420000 [18:53<4:18:10,  6.80it/s]

Absolute Balance:  -496.00255530864206


100%|█████████▉| 419768/420000 [24:57<00:00, 316.05it/s] 

Episode  16 Balance:  -952.6572326543195 Reward:  -952.6572326543195 Loss:  533.6235569461714
Epoch 00016: reducing learning rate of group 0 to 2.5000e-05.
Training Evaluation
Absolute Balance:  31.751116172839424
Validation Evaluation


100%|█████████▉| 419830/420000 [25:18<00:24,  7.03it/s] 

Absolute Balance:  -487.15500049382723


100%|██████████| 420000/420000 [25:19<00:00, 276.45it/s]



Iteration: 17, Gamma: 0.9316351676231056, Reward Shaping Factor: 0.45138004704424844, Battery Factor: 0.11209894126818201


 25%|██▍       | 104154/420000 [05:39<17:26, 301.82it/s]

Episode  4 Balance:  -1850.7377911111125 Reward:  -1850.7377911111125 Loss:  625.7296593729407
Training Evaluation


 25%|██▍       | 104154/420000 [05:51<17:26, 301.82it/s]

Absolute Balance:  -166.6448187654322
Validation Evaluation


 25%|██▍       | 104187/420000 [05:59<15:26:56,  5.68it/s]

Absolute Balance:  -428.2878094444445


 50%|████▉     | 209374/420000 [12:05<11:58, 293.25it/s]  

Episode  8 Balance:  -1054.2243992592594 Reward:  -1054.2243992592594 Loss:  487.90376187476795
Training Evaluation
Absolute Balance:  -193.75228172839513
Validation Evaluation


 50%|████▉     | 209405/420000 [12:26<10:52:12,  5.38it/s]

Absolute Balance:  -470.40120932098773


 75%|███████▍  | 314564/420000 [18:33<05:43, 306.87it/s]  

Episode  12 Balance:  -970.709854506174 Reward:  -970.709854506174 Loss:  489.1788436725037
Training Evaluation
Absolute Balance:  -40.08182716049393
Validation Evaluation


 75%|███████▍  | 314617/420000 [18:54<4:27:08,  6.57it/s]

Absolute Balance:  -364.265500617284


100%|█████████▉| 419769/420000 [25:00<00:00, 313.27it/s] 

Episode  16 Balance:  -956.4025380864158 Reward:  -956.4025380864158 Loss:  434.0803630842129
Training Evaluation


100%|█████████▉| 419769/420000 [25:12<00:00, 313.27it/s]

Absolute Balance:  -33.49335419753091
Validation Evaluation


100%|█████████▉| 419825/420000 [25:22<00:26,  6.59it/s] 

Absolute Balance:  -421.0557570370371


100%|██████████| 420000/420000 [25:22<00:00, 275.84it/s]



Iteration: 18, Gamma: 0.9626166516560957, Reward Shaping Factor: 0.29483870417489705, Battery Factor: 0.14027895309691588


 25%|██▍       | 104137/420000 [05:41<18:05, 290.95it/s]

Episode  4 Balance:  -1638.4964980864227 Reward:  -1638.4964980864227 Loss:  572.5237871718127
Training Evaluation
Absolute Balance:  -110.07112845679022
Validation Evaluation


 25%|██▍       | 104195/420000 [06:01<12:37:47,  6.95it/s]

Absolute Balance:  -448.2606600617284


 50%|████▉     | 209344/420000 [12:05<11:38, 301.41it/s]  

Episode  8 Balance:  -899.49241191358 Reward:  -899.49241191358 Loss:  591.8649054206908
Training Evaluation


 50%|████▉     | 209344/420000 [12:18<11:38, 301.41it/s]

Absolute Balance:  -36.21349777777787
Validation Evaluation


 50%|████▉     | 209400/420000 [12:27<9:06:10,  6.43it/s] 

Absolute Balance:  -467.16618876543214


 75%|███████▍  | 314564/420000 [18:31<06:55, 253.89it/s] 

Episode  12 Balance:  -958.669169320988 Reward:  -958.669169320988 Loss:  578.0554506228073
Training Evaluation
Absolute Balance:  100.00888981481475
Validation Evaluation


 75%|███████▍  | 314618/420000 [18:52<4:38:18,  6.31it/s]

Absolute Balance:  -461.45126808641976


100%|█████████▉| 419788/420000 [24:57<00:00, 292.40it/s] 

Episode  16 Balance:  -858.3696587037018 Reward:  -858.3696587037018 Loss:  578.6799909225665
Training Evaluation


100%|█████████▉| 419788/420000 [25:09<00:00, 292.40it/s]

Absolute Balance:  87.2331429012345
Validation Evaluation


100%|█████████▉| 419827/420000 [25:18<00:30,  5.75it/s] 

Absolute Balance:  -494.5207150617284


100%|██████████| 420000/420000 [25:19<00:00, 276.48it/s]



Iteration: 19, Gamma: 0.9730507020317265, Reward Shaping Factor: 0.23970204469615475, Battery Factor: 0.002411618561052886


 25%|██▍       | 104151/420000 [05:44<18:41, 281.53it/s]

Episode  4 Balance:  -1730.0564664814904 Reward:  -1730.0564664814904 Loss:  647.813070924487
Training Evaluation


 25%|██▍       | 104151/420000 [05:58<18:41, 281.53it/s]

Absolute Balance:  62.13409117283943
Validation Evaluation


 25%|██▍       | 104163/420000 [06:10<29:09:28,  3.01it/s]

Absolute Balance:  -505.24656932098776


 50%|████▉     | 209351/420000 [12:17<11:32, 304.22it/s]  

Episode  8 Balance:  -970.0221114814798 Reward:  -970.0221114814798 Loss:  627.6547046760097
Training Evaluation


 50%|████▉     | 209351/420000 [12:28<11:32, 304.22it/s]

Absolute Balance:  -18.666481172839532
Validation Evaluation


 50%|████▉     | 209401/420000 [12:40<10:16:04,  5.70it/s]

Absolute Balance:  -507.90910722222225


 69%|██████▊   | 288316/420000 [17:14<07:01, 312.39it/s]  

Epoch 00011: reducing learning rate of group 0 to 2.5000e-05.


 75%|███████▍  | 314556/420000 [18:48<05:49, 301.65it/s]

Episode  12 Balance:  -836.16807419753 Reward:  -836.16807419753 Loss:  648.9335059828591
Training Evaluation
Absolute Balance:  70.47537364197528
Validation Evaluation


 75%|███████▍  | 314619/420000 [19:10<4:14:28,  6.90it/s]

Absolute Balance:  -416.774083271605


100%|█████████▉| 419778/420000 [25:26<00:00, 266.65it/s] 

Episode  16 Balance:  -1044.4156960493842 Reward:  -1044.4156960493842 Loss:  669.954153199913
Training Evaluation


100%|█████████▉| 419778/420000 [25:39<00:00, 266.65it/s]

Absolute Balance:  9.597404135802483
Validation Evaluation


100%|█████████▉| 419826/420000 [25:48<00:30,  5.75it/s] 

Absolute Balance:  -458.88434759259263


100%|██████████| 420000/420000 [25:48<00:00, 271.17it/s]



Iteration: 20, Gamma: 0.9389725420676802, Reward Shaping Factor: 0.4566506095260564, Battery Factor: 0.08591533102894808


 25%|██▍       | 104162/420000 [05:37<17:16, 304.70it/s]

Episode  4 Balance:  -1761.5530870370383 Reward:  -1761.5530870370383 Loss:  489.6023966127541
Training Evaluation


 25%|██▍       | 104162/420000 [05:48<17:16, 304.70it/s]

Absolute Balance:  -97.63269549382721
Validation Evaluation


 25%|██▍       | 104180/420000 [05:58<18:36:48,  4.71it/s]

Absolute Balance:  -444.84101882716055


 50%|████▉     | 209358/420000 [12:10<12:44, 275.38it/s]  

Episode  8 Balance:  -937.5320106790108 Reward:  -937.5320106790108 Loss:  495.20154923794325
Training Evaluation
Absolute Balance:  -154.24685629629641
Validation Evaluation


 50%|████▉     | 209400/420000 [12:35<11:58:00,  4.89it/s]

Absolute Balance:  -484.1189742592593


 75%|███████▍  | 314571/420000 [18:56<06:31, 269.10it/s]  

Episode  12 Balance:  -997.940703086421 Reward:  -997.940703086421 Loss:  480.1487745830091
Training Evaluation


 75%|███████▍  | 314571/420000 [19:09<06:31, 269.10it/s]

Absolute Balance:  -35.09340074074077
Validation Evaluation


 75%|███████▍  | 314613/420000 [19:20<5:49:53,  5.02it/s]

Absolute Balance:  -439.3914601851853


100%|█████████▉| 419773/420000 [25:45<00:00, 291.99it/s] 

Episode  16 Balance:  -1059.8802365432102 Reward:  -1059.8802365432102 Loss:  445.79299617162906
Training Evaluation
Absolute Balance:  -2.540442222222257
Validation Evaluation


100%|█████████▉| 419813/420000 [26:07<00:35,  5.31it/s] 

Absolute Balance:  -389.7465424691359


100%|██████████| 420000/420000 [26:08<00:00, 267.81it/s]



Iteration: 21, Gamma: 0.9464765820604929, Reward Shaping Factor: 0.33009247537535447, Battery Factor: 0.11679691127552702


 25%|██▍       | 104149/420000 [07:06<26:17, 200.22it/s] 

Episode  4 Balance:  -1792.1049464814876 Reward:  -1792.1049464814876 Loss:  522.6112925102934
Training Evaluation


 25%|██▍       | 104149/420000 [07:19<26:17, 200.22it/s]

Absolute Balance:  -28.940323271604996
Validation Evaluation


 25%|██▍       | 104169/420000 [07:39<39:53:52,  2.20it/s]

Absolute Balance:  -460.3190798765433


 50%|████▉     | 209372/420000 [18:16<17:13, 203.90it/s]  

Episode  8 Balance:  -936.0091547530844 Reward:  -936.0091547530844 Loss:  490.3145768214017
Training Evaluation


 50%|████▉     | 209372/420000 [18:29<17:13, 203.90it/s]

Absolute Balance:  72.4780304938271
Validation Evaluation


 50%|████▉     | 209375/420000 [18:47<32:47:39,  1.78it/s]

Absolute Balance:  -450.8457662962963


 75%|███████▍  | 314567/420000 [29:32<07:11, 244.19it/s]  

Episode  12 Balance:  -857.4889871604922 Reward:  -857.4889871604922 Loss:  496.89044427662157
Training Evaluation
Absolute Balance:  17.69481475308636
Validation Evaluation


 75%|███████▍  | 314608/420000 [29:56<6:40:44,  4.38it/s]

Absolute Balance:  -433.87253802469144


 81%|████████  | 340850/420000 [32:59<09:48, 134.52it/s] 