# In [2]:
import tensorflow as tf
import tflearn
import numpy as np
import os


class DPG_S:
    """Policy network that maps a price window and previous weights to portfolio weights.

    The whole computation is built as a TensorFlow 1.x graph in ``__init__``:
    the policy network (``build_net``), the reward/objective (``loss_function``)
    and the training op, which *maximizes* the objective by minimizing its
    negative.

    Attributes created at construction time:
        X_t: placeholder ``[None, num_features, m, n]`` with the price tensor.
        W_previous: placeholder ``[None, m + 1]`` with the previous weights
            (column 0 is cash).
        pf_value_previous: placeholder ``[None, 1]``; it is fed by ``get_reward``
            and ``train`` but is not consumed by the objective built here.
        dailyReturn_t: placeholder ``[None, m]`` with the per-asset price relatives.
        action: softmax output of the network, ``[batch, m + 1]``.
        loss_function: NOTE - after ``__init__`` this instance attribute is the
            objective *tensor*; it shadows the method of the same name.
        train_op: op minimizing ``-loss_function``.
    """

    # Layer types understood by build_net.
    _LAYER_TYPES = ('Conv', 'LSTM', 'RNN')

    def __init__(self, feature_number, num_assets, window_size, sess, optimizer,
                 trading_cost, interest_rate, LogReturn=False, layer_type='Conv'):
        """Build the policy graph, the objective and the training op.

        Args:
            feature_number: number of price features per asset and period.
            num_assets: number of non-cash assets (m).
            window_size: number of periods in the input window (n).
            sess: TensorFlow session used by compute_W / get_reward / train.
            optimizer: object exposing ``minimize(tensor)``.
            trading_cost: proportional cost applied to weight changes.
            interest_rate: per-period return earned by the cash position.
            LogReturn: if True the objective is the mean log return, otherwise
                the mean simple return.
            layer_type: first layer of the network: 'Conv', 'LSTM' or 'RNN'.
        """
        # parameters
        self.trading_cost = trading_cost
        self.interest_rate = interest_rate
        self.num_features = feature_number
        self.n = window_size
        self.m = num_assets
        self.LogReturn = LogReturn
        self.layer_type = layer_type

        # Tensor of the prices
        self.X_t = tf.placeholder(tf.float32, [None, self.num_features, self.m, self.n])
        self.batch_size = tf.shape(self.X_t)[0]                         # Batch size
        self.W_previous = tf.placeholder(tf.float32, [None, self.m+1])  # w'_{t-1}
        self.action = self.build_net()  # Output of feeding X_t and W_previous to the NN

        self.pf_value_previous = tf.placeholder(tf.float32, [None, 1])   # p'_{t-1}
        self.dailyReturn_t = tf.placeholder(tf.float32, [None, self.m])  # y_t = Open(t+1)/Open(t)
        # Return of the cash position; it is the same for every sample of the batch.
        constant_return = tf.constant(1+self.interest_rate, shape=[1, 1])
        self.cash_return = tf.tile(constant_return, tf.stack([self.batch_size, 1]))
        # Price relatives including cash in column 0: [batch, m + 1]
        self.y_t = tf.concat([self.cash_return, self.dailyReturn_t], axis=1)
        # From here on ``self.loss_function`` is the objective tensor, not the method.
        self.loss_function = self.loss_function()

        # Objective function: maximize reward over the batch (min(-r) = max(r))
        self.train_op = optimizer.minimize(-self.loss_function)
        self.optimizer = optimizer
        self.sess = sess

        # Built once here: creating the squeeze op inside compute_W would add a
        # new node to the graph on every call.
        self._squeezed_action = tf.squeeze(self.action)

    def build_net(self):
        """Build the policy network and return the softmax weights ``[batch, m + 1]``.

        Also stores the pre-softmax scores in ``self.voting``.

        Raises:
            ValueError: if ``self.layer_type`` is not 'Conv', 'LSTM' or 'RNN'.
        """
        if self.layer_type not in self._LAYER_TYPES:
            raise ValueError("layer_type must be one of %s, got %r"
                             % (self._LAYER_TYPES, self.layer_type))

        # [batch, features, assets, periods] -> [batch, assets, periods, features]
        state = tf.transpose(self.X_t, [0, 2, 3, 1])
        if self.layer_type == 'Conv':
            network = tflearn.layers.conv_2d(state, 2,
                                             [1, 2],
                                             strides=[1, 1, 1, 1],
                                             padding='valid',
                                             activation='relu')
        else:
            neuron_number = 20
            if self.layer_type == 'LSTM':
                recurrent_layer, scope = tflearn.layers.lstm, "lstm"
            else:
                recurrent_layer, scope = tflearn.layers.simple_rnn, "rnn"

            resultlist = []
            for i in range(self.m):
                # One [batch, periods, features] sequence per asset. The loop
                # runs over the asset axis, so that is the axis that has to be
                # sliced (the previous code indexed the feature axis with the
                # asset index, and the RNN branch read an undefined variable).
                asset_sequence = state[:, i, :, :]
                # The same recurrent weights are shared by every asset: the
                # variables are created for the first asset and reused afterwards.
                result = recurrent_layer(asset_sequence,
                                         neuron_number,
                                         dropout=0.6,
                                         scope=scope,
                                         reuse=(i > 0))
                resultlist.append(result)
            network = tf.stack(resultlist)               # [assets, batch, neurons]
            network = tf.transpose(network, [1, 0, 2])   # [batch, assets, neurons]
            network = tf.reshape(network, [-1, self.m, 1, neuron_number])

        # Collapse the remaining width so that every asset has a single position.
        width = network.get_shape()[2]
        network = tflearn.layers.conv_2d(network, 48,
                                         [1, width],
                                         [1, 1],
                                         "valid",
                                         'relu',
                                         regularizer="L2",
                                         weight_decay=5e-9)
        # Append the previous non-cash weights as one extra channel per asset.
        w_previous = self.W_previous[:, 1:]
        network = tf.concat([network, tf.reshape(w_previous, [-1, self.m, 1, 1])], axis=3)
        network = tflearn.layers.conv_2d(network, 1,
                                         [1, network.get_shape()[2]],
                                         [1, 1],
                                         "valid",
                                         'relu',
                                         regularizer="L2",
                                         weight_decay=5e-9)
        network = network[:, :, 0, 0]  # Squeeze [batch, assets, 1, 1] -> [batch, assets]

        # Trainable score of the cash position, initialised to zero. The variable
        # name depends on the return type.
        bias_name = "cash_bias_log" if self.LogReturn else "cash_bias_simple"
        bias = tf.get_variable(bias_name, [1, 1], dtype=tf.float32,
                               initializer=tf.zeros_initializer)
        cash_bias = tf.tile(bias, tf.stack([self.batch_size, 1]))
        network = tf.concat([cash_bias, network], 1)  # cash becomes column 0
        self.voting = network                         # voting scores
        action = tf.nn.softmax(network)

        return action

    def loss_function(self):
        """Build the reward tensors and return the objective to be maximized.

        Sets ``profit_vector``, ``profit``, ``mean`` and ``reward``. With log
        returns it also sets ``standard_deviation`` and ``sharp_ratio``; with
        simple returns it also sets ``max_weight``.

        Returns:
            The scalar tensor produced by ``set_loss_function``.
        """
        # Per-sample growth factor p_t / p_{t-1} = mu_t * sum_i(w_ti * y_ti): [batch]
        growth = tf.reduce_sum(self.action * self.y_t, axis=1) * self.compute_mu()

        if self.LogReturn:
            # PROFIT VECTOR: P_t/P_{t-1} = exp(r_t), one entry per sample of the batch
            self.profit_vector = growth

            # PROFIT: product of the growth factors over the whole batch
            self.profit = tf.reduce_prod(self.profit_vector)
            self.mean = tf.reduce_mean(self.profit_vector)            # Mean growth factor over the batch
            self.reward = tf.reduce_mean(tf.log(self.profit_vector))  # Cumulated return (eq 22)
            loss_function = self.set_loss_function()                  # Objective used to train the NN

            # Evaluate performance
            self.standard_deviation = tf.sqrt(tf.reduce_mean((self.profit_vector - self.mean) ** 2))
            self.sharp_ratio = (self.mean - 1) / self.standard_deviation

            # NOTE: an alternative objective explored by the original author was
            # profit minus the profit of an equal-weighted portfolio minus
            # 0.1 * max(action).
        else:
            # Simple reward: r_t = (p_t - p_{t-1}) / p_{t-1} = mu_t * y_t . w_t - 1
            self.profit_vector = growth - 1

            # p_t / p_{t-1} = r_t + 1, so the batch profit is the product of (1 + r_t)
            self.profit = tf.reduce_prod((1 + self.profit_vector))
            self.mean = tf.reduce_mean(self.profit_vector)  # Mean simple return over the batch
            self.reward = tf.reduce_mean(self.profit)       # Cumulated return (eq 22)
            loss_function = self.set_loss_function()        # Objective used to train the NN

            # Largest weight of the batch. The original author kept it for an
            # alternative objective (reward minus the reward of an equal-weighted
            # portfolio minus 0.1 * max_weight) that is not used at the moment.
            self.max_weight = tf.reduce_max(self.action)

        return loss_function

    # Transaction remainder factor
    def compute_mu(self):
        """Return ``1 - c * sum_i |w_i - w'_i|`` over the non-cash assets: ``[batch]``."""
        c = self.trading_cost
        # Starts in [:, 1:] to not consider the cash in the calculations
        return 1-tf.reduce_sum(tf.abs(self.action[:, 1:]-self.W_previous[:, 1:]), axis=1)*c

    # Define the objective which the agent is going to maximize.
    # Keep in mind that what is minimized is the -loss function (see self.train_op)
    def set_loss_function(self):
        """Return the objective tensor: selected reward minus the regularization losses."""
        LAMBDA = 1e-4

        # Mean (log) return of the batch
        def loss_function1():
            if self.LogReturn:
                return tf.reduce_mean(tf.log(self.profit_vector))
            else:
                return tf.reduce_mean(self.profit_vector)

        # Alternative (currently unused): adds a penalty on the weights
        def loss_function2():
            if self.LogReturn:
                return tf.reduce_mean(tf.log(self.profit_vector)) - \
                   LAMBDA * tf.reduce_mean(tf.reduce_sum(tf.log(1 + 1e-6 + self.action), axis=1))
            else:
                return tf.reduce_mean(self.profit_vector) - \
                   LAMBDA * tf.reduce_mean(self.profit_vector)

        # Alternative (currently unused): mean return minus the cost of
        # changing the portfolio weights
        def with_last_w():
            if self.LogReturn:
                return tf.reduce_mean(tf.log(tf.reduce_sum(self.action[:] * self.y_t, axis=1)
                                          -tf.reduce_sum(tf.abs(self.action[:, 1:] - self.W_previous[:, 1:])
                                                         *self.trading_cost, axis=1)))
            else:
                return tf.reduce_mean(self.profit_vector \
                                          - tf.reduce_sum(tf.abs(self.action[:, 1:] - self.W_previous[:, 1:])
                                                         *self.trading_cost, axis=1))

        loss_function = loss_function1
        loss_tensor = loss_function()
        # The returned tensor is MAXIMIZED (train_op minimizes its negative), so
        # the regularization penalties have to be subtracted; adding them would
        # reward large weights.
        # NOTE: the collection is graph-wide, so it also contains the penalties
        # of any other network built in the same graph.
        regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        for regularization_loss in regularization_losses:
            loss_tensor -= regularization_loss
        return loss_tensor

    def _feed_dict(self, X_t_, W_previous_, pf_value_previous_, dailyReturn_t_):
        """Map the four input batches to their placeholders."""
        return {self.X_t: X_t_,
                self.W_previous: W_previous_,
                self.pf_value_previous: pf_value_previous_,
                self.dailyReturn_t: dailyReturn_t_}

    # Compute the agent's action
    def compute_W(self, X_t_, W_previous_):
        """Run the network and return the weights with all size-1 dimensions removed."""
        return self.sess.run(self._squeezed_action,
                             feed_dict={self.X_t: X_t_, self.W_previous: W_previous_})

    def get_reward(self, X_t_, W_previous_, pf_value_previous_, dailyReturn_t_):
        """Evaluate the objective tensor on the given batch and return its value."""
        return self.sess.run(self.loss_function,
                             feed_dict=self._feed_dict(X_t_, W_previous_,
                                                       pf_value_previous_, dailyReturn_t_))

    # Train the NN maximizing the reward: the input is a batch of the different values
    def train(self, X_t_, W_previous_, pf_value_previous_, dailyReturn_t_):
        """Run one optimization step on the given batch."""
        self.sess.run(self.train_op,
                      feed_dict=self._feed_dict(X_t_, W_previous_,
                                                pf_value_previous_, dailyReturn_t_))