In [None]:
import time
import copy
import numpy as np
import pandas as pd
import random
from collections import namedtuple, deque
import os
import math
from itertools import count
import pdb
import warnings

from visualize_helper import *
from stock_environment import *

# Show up to 500 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 500
# Suppress UserWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=UserWarning)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.distributions import Categorical
from torch.autograd import Variable

## Set Configs

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

In [None]:
# --- Policy network architecture ---
FC1_UNITS = 100  # width of the first hidden layer
FC2_UNITS = 100  # width of the second hidden layer

# --- Optimisation ---
LR = 5e-4        # learning rate handed to the Adam optimizer
GAMMA = 0.97     # discount factor applied to future rewards
BATCH_SIZE = 20  # forwarded to train_agent (earlier experiments used 100)

# --- Training schedule ---
NUM_EPISODES = 100
UPDATE_EVERY = 5  # run a policy update every this many episodes
PRINT_EVERY = 5   # print progress every this many episodes

# --- Not referenced by the cells visible in this notebook ---
BUFFER_SIZE = 200  # (earlier experiments used 500)
TAU = 1e-3         # for soft update of target parameters

# Tag embedded in the saved checkpoint's file name
REMARK = 'env_2_change_update_every_5_for_episode_100'

## Read Data

In [None]:
# Load the raw data set from the CSV file one directory above the notebook.
df = pd.read_csv('../df_apple.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Use the 'Date' column as the index. The guard keeps the cell idempotent:
# once 'Date' is the index it is no longer a column, so an unguarded second
# run of this cell would raise KeyError.
if df.index.name != 'Date':
    df = df.set_index('Date')

In [None]:
# Preview the frame again now that 'Date' is the index.
df.head()

## Visualize Data

In [None]:
# Date used to separate the training period from the validation period
# in the plots (also passed to plot_result further down).
date_split = '2018-01-02'

# visualize the data
plot_train_test(df, date_split)

## Split Data into Train and Valid Data

In [None]:
# Rows with year <= 2017 form the training set, later rows the validation set.
# The helper column 'year' is only needed for the split, so it is dropped from both.
train_data = df.loc[df['year'] <= 2017].drop(columns=['year'])
valid_data = df.loc[df['year'] > 2017].drop(columns=['year'])

print('Train Size: ', len(train_data))
print('Valid Size: ', len(valid_data))

## Introduce Environment

The environment is implemented in *stock_environment.py*.

In [None]:
# Smoke-test the environment: build it on the training data, reset it, then
# take ten uniformly random actions (integers drawn from {0, 1, 2}) and print
# whatever each step returns.
env = Environment2(train_data)
print(env.reset())
for _ in range(10):
    pact = np.random.randint(3)
    print('action: ', pact)
    print(env.step(pact))

## Policy Gradients Implementation

In [None]:
class PolicyNet(nn.Module):
    """Feed-forward policy network that maps a state to action probabilities.

    Layout: ``state_size -> fcl_units -> fc2_units -> action_size`` with ReLU
    after the two hidden layers and a softmax over the output layer.
    """

    def __init__(self, state_size, action_size, seed, fcl_units, fc2_units):
        """Initialize parameters and build model.

        Args:
            state_size: Number of input features per state.
            action_size: Number of discrete actions (size of the output layer).
            seed: Value passed to ``torch.manual_seed`` before the layers are created.
            fcl_units: Width of the first hidden layer.
            fc2_units: Width of the second hidden layer.
        """
        super().__init__()

        # torch.manual_seed seeds the global RNG and returns the Generator object.
        self.seed = torch.manual_seed(seed)

        self.fc1_layer = nn.Linear(state_size, fcl_units)
        self.fc2_layer = nn.Linear(fcl_units, fc2_units)
        self.fc3_layer = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        """Map a state (or batch of states) to action probabilities.

        Args:
            state: Float tensor whose last dimension has ``state_size`` entries.

        Returns:
            Tensor with the same leading dimensions as ``state`` and
            ``action_size`` entries in the last dimension, summing to 1 there.
        """
        hidden = F.relu(self.fc1_layer(state))
        hidden = F.relu(self.fc2_layer(hidden))
        # Normalise explicitly over the action axis. Omitting ``dim`` is
        # deprecated and silently picks dim 0 for 1-D and 3-D inputs.
        return F.softmax(self.fc3_layer(hidden), dim=-1)

## PG Agent

In [None]:
class PGAgent():
    """The agent interacting with and learning from the environment.

    Actions are sampled from a ``PolicyNet`` and the network is trained with a
    Monte-Carlo policy gradient on the collected trajectory.
    """

    def __init__(self, state_size, action_size, seed, fcl_units=64, fc2_units=64):
        """Initialize an agent object.

        Args:
            state_size: Number of features in a state.
            action_size: Number of discrete actions.
            seed: Seed for Python's ``random`` module and for the policy network.
            fcl_units: Width of the first hidden layer of the policy network.
            fc2_units: Width of the second hidden layer of the policy network.
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # policy network
        self.policy_net = PolicyNet(self.state_size, self.action_size, seed, fcl_units, fc2_units).to(device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LR)

    def act(self, state):
        """Sample an action for ``state`` from the current policy.

        Args:
            state: Sequence of numbers describing the current state.

        Returns:
            tuple: ``(action, distributions)`` where ``action`` is the sampled
            action tensor and ``distributions`` is the ``Categorical``
            distribution it was drawn from (needed later for ``log_prob``).
        """
        # Add a leading batch dimension of size 1 before the forward pass.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0).to(device)

        probs = self.policy_net(state)
        distributions = Categorical(probs)
        action = distributions.sample()

        return action, distributions

    def update(self, state_pool, action_pool, prob_pool, reward_pool, steps, gamma):
        """Run one Monte-Carlo policy-gradient step on a collected trajectory.

        Args:
            state_pool: Visited states. Accepted for interface compatibility;
                the update itself does not read it.
            action_pool: Action taken at each step.
            prob_pool: ``Categorical`` distribution each action was sampled
                from; it must still be attached to the autograd graph.
            reward_pool: Reward received at each step. Not modified.
            steps: Number of leading entries of the pools to use.
            gamma: Discount factor.

        Notes:
            A reward of exactly 0 resets the running return and keeps a return
            of 0 for that step (behaviour unchanged from the original code).
            NOTE(review): confirm that zero rewards are meant to act as a
            boundary marker for the environment used here.
        """
        if steps <= 0:
            return  # nothing collected, nothing to learn from

        # Work on a copy so the caller's reward list is left untouched.
        returns = np.asarray(reward_pool[:steps], dtype=np.float64)

        # Discount reward: accumulate from the last step backwards.
        running_add = 0.0
        for i in reversed(range(steps)):
            if returns[i] == 0:
                running_add = 0.0
            else:
                # Discount future rewards back to the present using gamma
                running_add = running_add * gamma + returns[i]
                returns[i] = running_add

        # Normalize the returns. The epsilon prevents a division by zero (and
        # NaN weights) when every return is identical, e.g. a one-step batch.
        eps = np.finfo(np.float32).eps
        returns = (returns - returns.mean()) / (returns.std() + eps)

        self.optimizer.zero_grad()

        step_losses = []
        for i in range(steps):
            dist = prob_pool[i]
            # Build the action on the distribution's own device.
            action = torch.as_tensor([int(action_pool[i])], dtype=torch.long, device=dist.probs.device)
            # Gradient ascent on log-prob * return == descent on its negative.
            step_losses.append(-dist.log_prob(action) * float(returns[i]))

        torch.stack(step_losses).sum().backward()

        self.optimizer.step()  # update weights and biases based on the computed gradients

In [None]:
# Build the agent: the state vector has env.history_t + 1 entries and there are 3 discrete actions.
agent = PGAgent(state_size=env.history_t+1, action_size=3, seed=0, fcl_units=FC1_UNITS, fc2_units=FC2_UNITS)

In [None]:
# watch an untrained agent: roll the policy out for at most 200 steps,
# stopping early when the environment reports the episode is done
state = env.reset()
for time_step in range(200):
    
    # sample an action from the policy's distribution (not an argmax);
    # the returned distribution is not needed here
    action, prob = agent.act(state)

    next_state, reward, done = env.step(action)
    state = next_state
    if done:
        break

## Train the Agent

In [None]:
def train_agent(num_episodes, remark='default', max_time=1000, batch_size=20, update_every=4, gamma=0.99, save=True):
    """Train PG agent.

    Rolls out the module-level ``agent`` on the module-level ``env`` for
    ``num_episodes`` episodes and runs a policy update on every
    ``update_every``-th episode.

    Args:
        num_episodes: Number of episodes to run.
        remark: Tag used in the checkpoint name ``./agents/PG_{remark}.pth``.
        max_time: Maximum number of steps per episode.
        batch_size: Unused; kept so existing callers keep working.
        update_every: Update the policy when the episode number is a multiple
            of this value.
        gamma: Discount factor forwarded to ``agent.update``.
        save: When True, save the policy network's state dict after training.

    Returns:
        list: Total (undiscounted) reward of each episode.
    """
    scores = []
    start = time.time()

    for i_episode in range(1, num_episodes+1):
        # Trajectory buffers for the current episode only. The update below
        # therefore sees just the episode on which it fires; the episodes in
        # between are rolled out but not learned from.
        # NOTE(review): confirm this is intended rather than accumulating the
        # trajectories of all `update_every` episodes.
        state_pool, action_pool, prob_pool, reward_pool = [], [], [], []

        state = env.reset()
        score = 0

        for time_step in range(max_time):

            # sample an action from the current policy
            action, prob = agent.act(state)

            next_state, reward, done = env.step(action)

            state_pool.append(state)
            action_pool.append(float(action))
            prob_pool.append(prob)
            reward_pool.append(reward)

            state = next_state
            score += reward

            if done:
                break

        scores.append(score)

        # Update policy on the schedule given by the `update_every` argument
        # (the global UPDATE_EVERY was previously used here, ignoring it).
        if i_episode % update_every == 0:
            agent.update(state_pool, action_pool, prob_pool, reward_pool, len(reward_pool), gamma)

        if i_episode % PRINT_EVERY == 0:
            # Average over every episode so far, timed since the last report.
            avg_score = np.mean(scores)
            elapsed_time = time.time()-start
            print(f'Episode: {i_episode}, Average Score: {avg_score:.2f}, Elapsed Time: {elapsed_time:.3f}')
            start = time.time()

    print('Training completed.')

    if save:
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs('./agents/', exist_ok=True)
        torch.save(agent.policy_net.state_dict(), f'./agents/PG_{remark}.pth')

    return scores

In [None]:
# Train for NUM_EPISODES episodes; each episode may run for at most
# len(env.data) - 1 steps. The returned list holds one score per episode.
scores = train_agent(num_episodes=NUM_EPISODES, remark=REMARK, max_time=len(env.data)-1, 
                     batch_size=BATCH_SIZE, update_every = UPDATE_EVERY, gamma=GAMMA)

In [None]:
# Plot the per-episode scores collected during training.
plot_reward(scores)

## Validation

In [None]:
# visualize the result from randomized action
# (serves as a baseline for the trained agent; fresh environments are built for
# the training and validation data)

plot_result(Environment2(train_data), Environment2(valid_data), date_split, random=True, algorithm_name='Random')

In [None]:
# visualize the result from trained agent

model_path = f'./agents/PG_{REMARK}.pth'

# Rebuild an agent with the same architecture and restore the saved weights.
agent = PGAgent(state_size=env.history_t+1, action_size=3, seed=0, fcl_units=FC1_UNITS, fc2_units=FC2_UNITS)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
agent.policy_net.load_state_dict(torch.load(model_path, map_location=device))

plot_result(Environment2(train_data), Environment2(valid_data), date_split, 
            random=False, algorithm_name='PG', agent=agent)

**Reference:**

1. https://github.com/Finspire13/pytorch-policy-gradient-example/blob/master/pg.py
2. https://github.com/zafarali/policy-gradient-methods/blob/master/pg_methods/policies.py