In [None]:
import time
import copy
import numpy as np
import pandas as pd
import random
from collections import namedtuple, deque
import os
import math
from itertools import count
import pdb
import warnings

from visualize_helper import *
from stock_environment import *

# Allow up to 500 columns when pandas renders a frame, and silence UserWarnings.
pd.options.display.max_columns = 500
warnings.simplefilter(action='ignore', category=UserWarning)

In [None]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

## Set Configs

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

### Current implementation for experiments <br>
**if FC=True:** <br>
FC1_UNITS = 100 <br>
FC2_UNITS = 100 <br>

**if FC=False:** <br>
FC1_UNITS = 32 <br>
FC2_UNITS = 64 <br>


In [None]:
# --- Replay buffer ---
BUFFER_SIZE = 500   # capacity of the replay buffer (original inline note: 200)
BATCH_SIZE = 100    # transitions sampled per learning step (original inline note: 20)

# --- Q-learning ---
GAMMA = 0.97        # discount factor applied to the bootstrapped next-state value
EPS_START = 1.0     # initial epsilon passed to train_agent
EPS_END = 0.01      # lower bound for epsilon
EPS_DECAY = 0.995   # multiplicative epsilon decay applied once per episode

TAU = 1e-3          # for soft update of target parameters
LR = 5e-4           # learning rate handed to the Adam optimizer
PRINT_EVERY = 5     # print training progress every N episodes
UPDATE_EVERY = 4    # soft-update the target network every N learning steps
FC = False          # True -> fully connected network, False -> convolutional network

# Layer widths forwarded to the network constructor, depending on the architecture.
FC1_UNITS, FC2_UNITS = (100, 100) if FC else (32, 64)

NUM_EPISODES = 400

# Experiment tag used in the checkpoint file name. It is derived from the
# settings above so that it cannot drift out of sync with them; for the current
# values it evaluates to 'env_2_buffer_500_batch_100_episode_400_conv'.
REMARK = (
    f'env_2_buffer_{BUFFER_SIZE}_batch_{BATCH_SIZE}'
    f'_episode_{NUM_EPISODES}_{"fc" if FC else "conv"}'
)

## Read Data

In [None]:
# Load the dataset; the path is relative to the current working directory.
df = pd.read_csv('../df_apple.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Use the 'Date' column as the index. The guard keeps this cell idempotent:
# re-running it after 'Date' has already become the index would otherwise raise
# a KeyError. A frame with no 'Date' column at all still fails loudly.
if df.index.name != 'Date':
    df = df.set_index('Date')

In [None]:
# Preview the first rows again, now that the frame is indexed by 'Date'.
df.head()

## Visualize data

In [None]:
# Date string handed to the plotting helpers as the train/test boundary.
date_split = '2018-01-02'

# plot_train_test is not defined in this notebook; presumably it comes from the
# star import of visualize_helper (confirm there).
plot_train_test(df, date_split)

## Split data into train and valid data

In [None]:
# Split on the 'year' column: rows up to and including 2017 are used for
# training, later rows for validation. The helper column is dropped afterwards.
train_data = df.loc[df['year'] <= 2017].drop(columns=['year'])
valid_data = df.loc[df['year'] > 2017].drop(columns=['year'])

print('Train Size: ', len(train_data))
print('Valid Size: ', len(valid_data))

## Introduce environment

The environment is implemented in *stock_environment.py*.

In [None]:
# Smoke-test the environment: reset it, then take 10 random actions
# (integers drawn from {0, 1, 2}) and print whatever each step returns.
env = Environment2(train_data)
print(env.reset())
for _ in range(10):
    pact = np.random.randint(3)
    print('action: ', pact)
    print(env.step(pact))

## Dueling Double DQN Implementation

In [None]:
class DDDQN_FC(nn.Module):
    """Dueling Q-network built from fully connected layers.

    Two shared hidden layers feed two separate heads: a state-value head with a
    single output and an advantage head with one output per action. The heads
    are combined as ``Q = V + (A - mean(A))``, with the mean taken over the
    actions of each sample.
    """

    def __init__(self, state_size, action_size, seed, fcl_units, fc2_units):
        """Initialize parameters and build model.

        Args:
            state_size: Number of input features per state.
            action_size: Number of discrete actions (width of the output).
            seed: Value passed to ``torch.manual_seed`` before the layers are
                created, so weight initialisation is reproducible.
            fcl_units: Width of the first shared hidden layer.
            fc2_units: Width of the second shared hidden layer; each head uses
                an intermediate layer of ``fc2_units // 2`` units.
        """
        super(DDDQN_FC, self).__init__()

        self.seed = torch.manual_seed(seed)

        # Shared trunk.
        self.fc1_layer = nn.Linear(state_size, fcl_units)
        self.fc2_layer = nn.Linear(fcl_units, fc2_units)

        # Head-specific hidden layers.
        self.fc3_state_value = nn.Linear(fc2_units, fc2_units // 2)      # for state-value
        self.fc3_advantage_value = nn.Linear(fc2_units, fc2_units // 2)  # for action advantages

        # Output layers.
        self.state_value = nn.Linear(fc2_units // 2, 1)
        self.advantage_value = nn.Linear(fc2_units // 2, action_size)

    def forward(self, state):
        """Map a state (or a batch of states) to one Q-value per action.

        Args:
            state: Float tensor whose last dimension has ``state_size`` entries.

        Returns:
            Tensor of Q-values whose last dimension has ``action_size`` entries.
        """
        hidden = F.relu(self.fc1_layer(state))
        hidden = F.relu(self.fc2_layer(hidden))
        hidden_state = F.relu(self.fc3_state_value(hidden))
        hidden_advantage = F.relu(self.fc3_advantage_value(hidden))

        state_value_output = self.state_value(hidden_state)
        advantage_value_output = self.advantage_value(hidden_advantage)

        # Centre the advantages per sample (over the action dimension only).
        # A plain .mean() would average over the whole batch as well, making
        # every sample's Q-values depend on the other samples in the batch.
        advantage_mean = advantage_value_output.mean(dim=-1, keepdim=True)
        Q_value = state_value_output + (advantage_value_output - advantage_mean)

        return Q_value
    

class DDDQN_CONV(nn.Module):
    """Dueling Q-network with a 1-D convolutional feature extractor.

    The input is treated as a sequence of length 1 with ``state_size`` channels,
    i.e. a tensor of shape ``(batch, state_size, 1)``. The flattened convolution
    output feeds a value stream (one output) and an advantage stream (one output
    per action), combined as ``Q = V + (A - mean(A))`` with the mean taken over
    the actions of each sample.
    """

    def __init__(self, state_size, action_size, seed, fcl_units, fc2_units):
        """Build the convolutional trunk and the two dueling streams.

        Args:
            state_size: Number of input channels of the first convolution.
            action_size: Number of discrete actions.
            seed: Value passed to ``torch.manual_seed`` before the layers are
                created, mirroring ``DDDQN_FC`` so initialisation is reproducible.
            fcl_units: Output channels of the first convolution.
            fc2_units: Output channels of the second and third convolutions.
        """
        super(DDDQN_CONV, self).__init__()
        self.seed = torch.manual_seed(seed)

        self.state_size = state_size
        self.action_size = action_size
        # Misspelled attribute kept so existing code reading it keeps working.
        self.acion_size = action_size

        self.conv_net = nn.Sequential(
            nn.Conv1d(self.state_size, fcl_units, kernel_size=1, stride=2),
            nn.ReLU(),
            nn.Conv1d(fcl_units, fc2_units, kernel_size=1, stride=2),
            nn.ReLU(),
            nn.Conv1d(fc2_units, fc2_units, kernel_size=1, stride=1),
            nn.ReLU()
        )

        # Both streams consume the flattened convolution output.
        n_features = self.feature_size()

        self.value_stream = nn.Sequential(
            nn.Linear(n_features, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

        self.advantage_stream = nn.Sequential(
            nn.Linear(n_features, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, action_size)
        )

    def forward(self, state):
        """Map a batch of states to one Q-value per action.

        Args:
            state: Float tensor of shape ``(batch, state_size, 1)``.

        Returns:
            Tensor of shape ``(batch, action_size)``.
        """
        features = self.conv_net(state)
        features = features.view(features.size(0), -1)

        state_value_output = self.value_stream(features)
        advantage_value_output = self.advantage_stream(features)

        # Centre the advantages per sample (over the action dimension only).
        # A plain .mean() would also average across the batch, coupling the
        # Q-values of unrelated samples.
        advantage_mean = advantage_value_output.mean(dim=-1, keepdim=True)
        Qsa = state_value_output + (advantage_value_output - advantage_mean)
        return Qsa

    def feature_size(self):
        """Return the flattened size of the convolution output for one state."""
        # Probe with an all-zero state on the same device as the conv weights.
        param_device = next(self.conv_net.parameters()).device
        with torch.no_grad():
            probe = torch.zeros(1, self.state_size, 1, device=param_device)
            return self.conv_net(probe).view(1, -1).size(1)

In [None]:
class ReplayBuffer(object):
    """Bounded FIFO store of transitions with random mini-batch sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Create an empty buffer.

        Args:
            action_size: Stored on the instance; not used by the buffer itself.
            buffer_size: Maximum number of transitions kept (oldest dropped first).
            batch_size: Number of transitions returned by ``sample``.
            seed: Seed applied to Python's global ``random`` module.
        """
        self.action_size = action_size
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        # random.seed() returns None, so this attribute is always None.
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = self.experience(state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self):
        """Draw ``batch_size`` transitions and return them as column tensors.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors
            on the module-level ``device``; actions are long, the rest float.
        """
        drawn = random.sample(self.memory, k=self.batch_size)
        batch = [exp for exp in drawn if exp is not None]

        def column(field):
            # Stack one field of every transition row-wise into a 2-D array.
            return np.vstack([getattr(exp, field) for exp in batch])

        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        # Booleans are cast to uint8 first, then to float.
        dones = torch.from_numpy(column("done").astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

## DDDQN Agent

In [None]:
class DDDQNAgent():
    """Agent that acts in the environment and learns from replayed experience.

    It holds an online network (``dqn_net``), a target network (``target_net``),
    an Adam optimizer for the online network and a replay buffer. It relies on
    the module-level settings ``device``, ``LR``, ``BUFFER_SIZE``, ``BATCH_SIZE``,
    ``GAMMA``, ``TAU`` and ``UPDATE_EVERY``.
    """

    def __init__(self, state_size, action_size, seed, fcl_units=64, fc2_units=64, fc=True):
        """Build both networks, the optimizer and the replay buffer.

        Args:
            state_size: Size of a state as expected by the network.
            action_size: Number of discrete actions.
            seed: Seed for Python's ``random`` module; also forwarded to the
                networks and the replay buffer.
            fcl_units: First layer width forwarded to the network.
            fc2_units: Second layer width forwarded to the network.
            fc: True selects ``DDDQN_FC``, False selects ``DDDQN_CONV``.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)  # random.seed() returns None
        self.fc = fc

        # Online and target networks share one architecture.
        network_cls = DDDQN_FC if self.fc else DDDQN_CONV
        self.dqn_net = network_cls(state_size, action_size, seed, fcl_units, fc2_units).to(device)
        self.target_net = network_cls(state_size, action_size, seed, fcl_units, fc2_units).to(device)

        # Start the target network as an exact copy and keep it in eval mode.
        self.target_net.load_state_dict(self.dqn_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.dqn_net.parameters(), lr=LR)

        self.buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.time_step = 0  # counts learning steps modulo UPDATE_EVERY

    def memorize(self, state, action, reward, next_state, done):
        """Store one transition and run a learning step once enough are stored."""
        self.buffer.add(state, action, reward, next_state, done)

        # Learning starts only when the buffer holds more than one batch.
        if len(self.buffer) <= BATCH_SIZE:
            return

        self.learn(self.buffer.sample(), GAMMA)

    def act(self, state, eps=0.):
        """Pick an action for ``state`` using an epsilon-greedy rule.

        Args:
            state: A single state, convertible by ``torch.FloatTensor``.
            eps: Probability threshold for choosing a random action.

        Returns:
            Index of the chosen action.
        """
        # Add the batch dimension (and a trailing length-1 axis for the conv net).
        batched = torch.FloatTensor(state).unsqueeze(0).to(device)
        if not self.fc:
            batched = batched.unsqueeze(2)

        # Evaluate without gradients, then restore training mode.
        self.dqn_net.eval()
        with torch.no_grad():
            action_values = self.dqn_net(batched)
        self.dqn_net.train()

        if random.random() <= eps:
            return random.choice(np.arange(self.action_size))  # explore
        return np.argmax(action_values.cpu().data.numpy())  # exploit

    def learn(self, experiences, gamma):
        """Run one gradient step on a sampled batch (double DQN target).

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states, dones)``
                as produced by ``ReplayBuffer.sample``.
            gamma: Discount factor.
        """
        states, actions, rewards, next_states, dones = experiences

        if not self.fc:
            states = states.unsqueeze(2)
            next_states = next_states.unsqueeze(2)

        # Double DQN: the online network selects the next action ...
        online_next_q = self.dqn_net(next_states).detach()
        Q_next_best_action = online_next_q.max(1)[1].unsqueeze(1)

        # ... and the target network supplies the value of that action.
        target_next_q = self.target_net(next_states).detach()
        Q_target_next = target_next_q.gather(1, Q_next_best_action)

        # Terminal transitions (done == 1) contribute no bootstrapped value.
        Q_target = rewards + (gamma * Q_target_next * (1 - dones))

        # Q-values the online network assigns to the actions actually taken.
        Q_expected = self.dqn_net(states).gather(1, actions)

        loss = F.mse_loss(Q_target, Q_expected)

        # Clear old gradients, back-propagate and apply the update.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Blend the online weights into the target net every UPDATE_EVERY steps.
        self.time_step = (self.time_step + 1) % UPDATE_EVERY
        if self.time_step == 0:
            self.soft_update(self.dqn_net, self.target_net, TAU)

    def soft_update(self, dqn_net, target_net, tau):
        """Move each target parameter towards its online counterpart.

        Every target parameter becomes ``tau * online + (1 - tau) * target``.
        """
        for online_param, frozen_param in zip(dqn_net.parameters(), target_net.parameters()):
            blended = tau * online_param.data + (1.0 - tau) * frozen_param.data
            frozen_param.data.copy_(blended)

In [None]:
# Build the agent. The state size is the environment's history_t plus one, and
# the action count is fixed at 3 here.
agent = DDDQNAgent(state_size=env.history_t+1, action_size=3, seed=0, fcl_units=FC1_UNITS, fc2_units=FC2_UNITS, fc=FC)

In [None]:
# Watch an untrained agent: run it for at most 200 steps, stopping early when
# the environment reports the episode is done.
state = env.reset()
for time_step in range(200):
    
    # select an action (eps keeps its default of 0.)
    action = agent.act(state)
    
    # the reward is unpacked but not used in this rollout
    next_state, reward, done = env.step(action)
    state = next_state
    if done:
        break

## Train the Agent

In [None]:
def train_agent(num_episodes, remark='default', max_time=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995, fc=True, save=True):
    """Train the DDDQN agent with an epsilon-greedy policy.

    Operates on the module-level ``env`` and ``agent`` objects and reads the
    module-level ``PRINT_EVERY`` setting.

    Args:
        num_episodes: Number of episodes to run.
        remark: Tag used in the checkpoint file name.
        max_time: Maximum number of steps per episode.
        eps_start: Epsilon used for the first episode.
        eps_end: Lower bound for epsilon.
        eps_decay: Factor epsilon is multiplied by after every episode.
        fc: Not used by this function; kept so existing calls keep working.
        save: When True, write the online network's weights to
            ``./agents/DDDQN_{remark}.pth`` after training.

    Returns:
        List with the summed reward of every episode, in order.
    """
    scores = []
    scores_window = deque(maxlen=100)  # the most recent scores, for the running average
    eps = eps_start
    start = time.time()

    for i_episode in range(1, num_episodes+1):
        state = env.reset()
        score = 0
        for time_step in range(max_time):
            action = agent.act(state, eps)
            next_state, reward, done = env.step(action)

            # Store the transition; the agent decides when to learn from it.
            agent.memorize(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores_window.append(score)
        scores.append(score)

        # Decay exploration once per episode, never below eps_end.
        eps = max(eps_end, eps_decay*eps)

        if i_episode % PRINT_EVERY == 0:
            avg_score = np.mean(scores_window)
            elapsed_time = time.time()-start
            print(f'Episode: {i_episode}, Average Score: {avg_score:.2f}, Elapsed Time: {elapsed_time:.3f}')
            # Restart the timer so each report covers only the episodes since the last one.
            start = time.time()

    print('Training completed.')

    if save:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs('./agents/', exist_ok=True)
        torch.save(agent.dqn_net.state_dict(), f'./agents/DDDQN_{remark}.pth')

    return scores

In [None]:
# Train with the configured settings; each episode is capped at len(env.data)-1 steps.
scores = train_agent(num_episodes=NUM_EPISODES, remark=REMARK, max_time=len(env.data)-1, 
                     eps_start=EPS_START, eps_end=EPS_END, eps_decay=EPS_DECAY, fc=FC)

In [None]:
# Plot the per-episode scores returned by training. plot_loss_reward is not
# defined in this notebook; presumably it comes from visualize_helper (confirm).
plot_loss_reward(scores)

## Validation

In [None]:
# Visualize the result of random actions on fresh train and validation
# environments, as a baseline labelled 'Random'.

plot_result(Environment2(train_data), Environment2(valid_data), date_split, random=True, algorithm_name='Random')

In [None]:
# Visualize the result from the trained agent.

model_path = f'./agents/DDDQN_{REMARK}.pth'

# Rebuild an agent with the same architecture, then restore the saved weights.
agent = DDDQNAgent(state_size=env.history_t+1, action_size=3, seed=0, fcl_units=FC1_UNITS, fc2_units=FC2_UNITS, fc=FC)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU machine still loads on a CPU-only machine.
agent.dqn_net.load_state_dict(torch.load(model_path, map_location=device))

plot_result(Environment2(train_data), Environment2(valid_data), date_split,
            random=False, algorithm_name='DDDQN', agent=agent)