In [1]:
from AGCRLEnvBin import AGCRLEnv
from DQNAgent import DQNAgent
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os
from tqdm import tqdm
import time
import tensorflow as tf

In [2]:
with open('observations.pickle', 'rb') as handle:
    obs = pickle.load(handle)
with open('actions.pickle', 'rb') as handle:
    actions = pickle.load(handle)

In [3]:
assim_rl_actionspace=np.linspace(0,100,21)
env=AGCRLEnv(obs,actions,"assim_sp",assim_rl_actionspace)

In [4]:
obs=env.resetinit()

In [5]:
# Environment settings
EPISODES = 1
# Exploration settings
epsilon = 1  # not a constant, going to be decayed
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001

#  Stats settings
AGGREGATE_STATS_EVERY = 500  # steps
SHOW_PREVIEW = False
SAVE_MODEL_EVERY=5000
MODEL_NAME="AGCRL_ASSIM_BIN"
MIN_REWARD = -5000

In [6]:
agent = DQNAgent(env,env.action_space)

In [None]:
ep_rewards=[]
acc_regret=[0]
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
    step=0
    # Update tensorboard step every episode
    agent.tensorboard.step = episode

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get initial state
    current_state = env.reset()

    # Reset flag and start iterating until episode ends
    done = False
    while not done:
        step+=1
        # This part stays mostly the same, the change is to query a model for Q values
        if np.random.random() > epsilon:
            # Get action from Q table
            action = np.argmax(agent.get_qs(current_state))
        else:
            # Get random action
            action = np.random.randint(0, len(env.action_space))

        new_state, reward, done = env.step(action)

        # Every step we update replay memory and train main network
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)

        current_state = new_state
        ep_rewards.append(reward)
#         print(step%AGGREGATE_STATS_EVERY)
        if (step % AGGREGATE_STATS_EVERY)==0:
            print(step)
            
            average_reward = sum(ep_rewards)/len(ep_rewards)
            min_reward = min(ep_rewards)
            max_reward = max(ep_rewards)
            ar=max_reward-average_reward
            arval=acc_regret[-1]+ar
            acc_regret.append(arval)
            agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon,acc_regret=arval)
            ep_rewards=[]
            # Save model, but only when min reward is greater or equal a set value
            if min_reward >= MIN_REWARD and step%SAVE_MODEL_EVERY==0:
                agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

            if epsilon > MIN_EPSILON:
                epsilon *= EPSILON_DECAY
                epsilon = max(MIN_EPSILON, epsilon)

  0%|          | 0/1 [00:00<?, ?episodes/s]

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: models/AGCRL_ASSIM_BIN___100.00max____4.60avg____0.00min__1618418799.model/assets
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
INFO:tensorflow:Assets written to: models/AGCRL_ASSIM_BIN___100.00max____4.60avg____0.00min__1618419060.model/assets
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
INFO:tensorflow:Assets written to: models/AGCRL_ASSIM_BIN___100.00max____5.40avg____0.00min__1618419324.model/assets
15500
16000
16500
17000
17500
18000
18500
19000
