<a href="https://colab.research.google.com/github/GiovaniValdrighi/NOTEARS/blob/master/notears_nonlinear.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NOTEARS não-linear
Implementação do algoritmo NOTEARS não-linear para o aprendizado de estruturas (DAG).

In [37]:
import tensorflow as tf
import numpy as np
import scipy.optimize as sopt
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


https://stackoverflow.com/questions/59029854/use-scipy-optimizer-with-tensorflow-2-0-for-neural-network-training

In [0]:
class Notears_MLP(tf.keras.models.Model):
  '''
  Neural network used by the NOTEARS non linear model.

  The first layer is split in two dense layers (fc1_pos, fc1_neg) whose
  difference gives the signed first-layer weights; it is followed by one
  LocallyConnected1D layer per remaining entry of dims, so that every
  variable gets its own stack of weights.

  Inputs:
    dims [int] - list of dimensions: dims[0] is the number of variables,
                 dims[1:] are the hidden sizes, the last dimension must be 1
    batch_size [int] - size of the batch for training (kept for backward
                       compatibility; the forward pass accepts any batch size)
    bias [bool] - use of bias in the fc1 layers

  Raises:
    ValueError - if dims has fewer than two entries or does not end with 1
  '''

  class bound_adj(tf.keras.constraints.Constraint):
    '''Constraint for fc1 kernels: weights are non-negative and the weights
    that connect a variable to its own hidden units are 0.

    NOTE(review): this constraint is not attached to any layer in this class;
    when training with Scipy the same restriction is expressed by flat_bounds.
    '''
    def __init__(self, n_variables, dims):
      self.n_variables = n_variables
      self.dims = dims

    def __call__(self, w):
      # Drop the negative entries
      w = w * tf.cast(tf.math.greater_equal(w, 0.), w.dtype)
      # Kernel is [d, d * m0] with unit index j * m0 + m, so repeating the
      # identity columns m0 times marks exactly the entries where i == j
      self_loops = tf.repeat(tf.eye(self.n_variables, dtype = w.dtype), self.dims[1], axis = 1)
      return w * (1. - self_loops)


  def __init__(self, dims, batch_size = 100, bias = True):
    super(Notears_MLP, self).__init__()
    if len(dims) < 2:
      raise ValueError('dims must contain at least two entries')
    if dims[-1] != 1:
      raise ValueError('the last entry of dims must be 1')
    self.dims = dims
    self.n_variables = dims[0]
    self.batch_size = batch_size
    #fc1 layer [d * m0]
    self.fc1_pos = tf.keras.layers.Dense(dims[0] * dims[1], use_bias = bias)
    self.fc1_neg = tf.keras.layers.Dense(dims[0] * dims[1], use_bias = bias)
    #fc2 layers [d, m1, m2]
    # NOTE(review): the sigmoid is applied after every locally connected
    # layer, including the last one, so the outputs lie in (0, 1) - confirm
    # that this is intended for the regression targets.
    self.locally = [
        tf.keras.layers.LocallyConnected1D(dims[i + 2], 1, activation = 'sigmoid')
        for i in range(len(dims) - 2)
    ]

  def _ordered_layers(self):
    '''Layers in the order used by every flat vector: fc1_pos, fc1_neg, locally'''
    return [self.fc1_pos, self.fc1_neg] + list(self.locally)

  def _build_if_needed(self):
    '''Create the layer weights with a dummy forward pass if they do not exist yet'''
    if not self.fc1_pos.built:
      self(tf.zeros((1, self.n_variables)))

  def _fc1_sq_adj(self):
    '''Squared first-layer weights summed over hidden units, A[i, j] for edge i -> j'''
    self._build_if_needed()
    fc1_weights = self.fc1_pos.weights[0] - self.fc1_neg.weights[0] #[d, d * m0]
    # call() reshapes the d * m0 units as [d, m0], i.e. unit index = j * m0 + m
    fc1_weights = tf.reshape(fc1_weights, (self.n_variables, self.n_variables, self.dims[1])) #[d, d, m0]
    return tf.math.reduce_sum(fc1_weights * fc1_weights, axis = 2) #[d, d]

  def bias_shape(self):
    '''Utility function for val_and_grad, shapes of the existing bias vectors'''
    self._build_if_needed()
    return [layer.weights[1].shape for layer in self._ordered_layers() if len(layer.weights) > 1]

  def call(self, inputs):
    '''
    Forward procedure in the neural network, pass the inputs through fc1 and fc2 layers

    Inputs:
      inputs [tensor] - tensor of samples with shape [n, n_variables]

    Outputs:
      out [tensor] - tensor with shape [n, n_variables]
    '''

    hid = self.fc1_pos(inputs) - self.fc1_neg(inputs) #[n, d * m0]
    # -1 on the batch axis so that any number of samples is accepted
    out = tf.reshape(hid, (-1, self.n_variables, self.dims[1])) #[n, d, m0]
    for layer in self.locally:
      out = layer(out)
    out = tf.squeeze(out, 2) #[n, d, 1] -> [n, d]
    return out

  def flat_params(self):
    '''
    Return flat vector of params to Scipy minimize
    Order is: fc1_pos weights bias - fc1_neg weights bias - layers weights bias
    '''
    self._build_if_needed()
    params = [tf.reshape(w, [-1]) for layer in self._ordered_layers() for w in layer.weights]
    return tf.cast(tf.concat(params, axis = 0), tf.float64).numpy()

  def flat_bounds(self):
    '''
    Return flat list of bounds to Scipy minimize
    Order is: fc1_pos weights bias - fc1_neg weights bias - layers weights bias

    fc1 weights are non-negative and the weights from a variable to its own
    hidden units are fixed at 0; every other parameter is unbounded.
    '''
    self._build_if_needed()
    d, m0 = self.n_variables, self.dims[1]
    # Same order as the row-major flattening of the [d, d * m0] kernel:
    # input variable i, output variable j, hidden unit m
    bounds_fc1 = [(0, 0) if i == j else (0, None)
                  for i in range(d) for j in range(d) for _ in range(m0)]
    bounds = []
    for layer in (self.fc1_pos, self.fc1_neg):
      bounds.extend(bounds_fc1)
      for w in layer.weights[1:]:
        bounds.extend([(None, None)] * w.shape.num_elements())
    for layer in self.locally:
      for w in layer.weights:
        bounds.extend([(None, None)] * w.shape.num_elements())
    return bounds

  def update_params(self, params):
    '''
    Write a flat vector of params (same order as flat_params) back into the layers

    Inputs:
      params [array] - flat vector with one entry per model parameter

    Raises:
      ValueError - if the vector length does not match the number of parameters
    '''
    self._build_if_needed()
    params = np.asarray(params)
    variables = [w for layer in self._ordered_layers() for w in layer.weights]
    expected = sum(w.shape.num_elements() for w in variables)
    if params.size != expected:
      raise ValueError('expected %d parameters, got %d' % (expected, params.size))
    offset = 0
    for w in variables:
      size = w.shape.num_elements()
      chunk = np.reshape(params[offset:offset + size], w.shape.as_list())
      w.assign(tf.cast(chunk, w.dtype))
      offset += size

  def _h(self):
    '''Calculate the constraint of fc1 to ensure that it's a DAG'''
    A = self._fc1_sq_adj()
    d = self.n_variables
    #(Yu et al. 2019 DAG-GNN)
    # h(w) = tr[(I + A / d)^d] - d
    M = tf.eye(d, dtype = A.dtype) + A / d
    # Matrix power M^(d - 1); tf.pow would only raise each entry to the power
    E = tf.eye(d, dtype = A.dtype)
    for _ in range(d - 1):
      E = tf.linalg.matmul(E, M)
    # sum(E^T * M) is the trace of E @ M
    h = tf.math.reduce_sum(tf.transpose(E) * M) - d
    return h
  
  def _l2_loss(self):
    '''Calculate L2 loss from model parameters'''
    self._build_if_needed()
    fc1_weights = self.fc1_pos.weights[0] - self.fc1_neg.weights[0]
    loss = tf.math.reduce_sum(tf.pow(fc1_weights, 2))
    for layer in self.locally:
      loss += tf.math.reduce_sum(tf.pow(layer.weights[0], 2))
    return loss

  def _l1_loss(self):
    '''Calculate L1 loss from fc1 parameters (both parts are kept non-negative by flat_bounds)'''
    self._build_if_needed()
    return tf.math.reduce_sum(self.fc1_pos.weights[0] + self.fc1_neg.weights[0])

  def to_adj(self):
    '''Reshape fc1 to an adjacency matrix, entry [i, j] is the L2 norm of the
    first-layer weights that connect variable i to the hidden units of variable j'''
    return tf.sqrt(self._fc1_sq_adj()) #[d, d]


In [0]:
# Toy sample matrix: 100 rows of ones, with the second column replaced by
# 5 plus a random integer drawn from [0, 10)
data = np.ones((100, 4))
data[:, 1] = 5 * data[:, 1] + np.random.randint(0, 10, size = 100)
data_tf = tf.constant(data, dtype = np.float32)

# Build the network and run a single forward pass on the toy samples
model = Notears_MLP([4, 5, 7, 1])
out = model(data_tf)

In [0]:
def notears_nonlinear(dims, X,  h_tol = 1e-4, threshold = 0.2, lambda1 = 0.5, lambda2 = 0.5, rho_max = 1e20, max_iter = 1e16):
  '''
    Function that apply the NOTEARS algorithm in a non linear model

    The model parameters are optimized with Scipy L-BFGS-B inside an
    augmented Lagrangian loop: rho is multiplied by 10 while the constraint
    value h does not shrink enough, then alpha is updated with rho * h.
    
    Args:
        dims (list of int) : list of dimensions for neural network
        X (numpy.matrix) : [n_samples, n_variables] samples matrix 
        h_tol (float) : tolerance for constraint, exit condition 
        threshold (float) : threshold for W_est edge values
        lambda1 (float) : L1 regularization parameter
        lambda2 (float) : L2 regularization parameter 
        rho_max (float) : max value for rho in augmented lagrangian
        max_iter (int) : max number of iterations
    Outputs:
        W_est (numpy.matrix): [n_variables, n_variables] estimated graph
  '''
  # h must fall below this fraction of its previous value, otherwise rho grows
  h_factor = 0.25

  # The network works in float32; np.asarray also turns a numpy.matrix into a plain array
  X = tf.constant(np.asarray(X), dtype = tf.float32)

  model = Notears_MLP(dims, batch_size = X.shape[0])
  # Layer weights only exist after a first forward pass
  model(X)

  # Variables in the order of model.flat_params / model.flat_bounds:
  # fc1_pos weights bias - fc1_neg weights bias - layers weights bias
  variables = [w for layer in [model.fc1_pos, model.fc1_neg] + list(model.locally)
               for w in layer.weights]

  def set_params(flat):
    '''Write a flat parameter vector back into the model variables'''
    offset = 0
    for var in variables:
      size = var.shape.num_elements()
      chunk = np.reshape(flat[offset:offset + size], var.shape.as_list())
      var.assign(tf.cast(chunk, var.dtype))
      offset += size

  def square_loss(X, Y):
    '''Calculate mean square error from X Y'''
    n = X.shape[0]
    loss = 0.5 * tf.math.reduce_sum(tf.pow(X - Y, 2)) / n
    return loss

  def val_and_grad(params):
    '''Calculate loss value and gradient for Scipy optimize at the point params'''
    set_params(params)
    with tf.GradientTape() as tape:
      # Keep Y as a tensor: converting it to numpy would cut the gradient of the data term
      Y = model(X)
      mse_loss = square_loss(X, Y) 
      h_val = model._h()
      h_constraint = 0.5 * rho * h_val * h_val + alpha * h_val
      fc1_loss = lambda1 * model._l1_loss()
      locally_loss = 0.5 * lambda2 * model._l2_loss()
      loss = mse_loss + h_constraint + fc1_loss + locally_loss
    grads = tape.gradient(loss, variables)
    # A variable that does not reach the loss has gradient None, use zeros for it
    flat_grad = np.concatenate([
        np.zeros(var.shape.num_elements()) if grad is None else tf.reshape(grad, [-1]).numpy()
        for grad, var in zip(grads, variables)
    ])
    # Scipy expects a float64 value and a float64 gradient
    return float(loss.numpy()), flat_grad.astype(np.float64)

  ########################
  # Optimization process #
  ########################
  # rho has to start above 0, otherwise rho * 10 never grows
  rho, alpha, h = 1., 0., np.inf
  # The bounds depend only on the architecture, compute them once
  bounds = model.flat_bounds()
  for _ in range(int(max_iter)):
    h_new = None
    while rho < rho_max:
      sol = sopt.minimize(val_and_grad, model.flat_params(), method = "L-BFGS-B", jac = True, bounds = bounds)
      # Leave the model at the solution returned by the optimizer
      set_params(sol.x)
      h_new = model._h().numpy()

      #Updating rho constraint parameter
      if h_new > h * h_factor:
        rho = rho * 10
      else:
        break

    # rho was already at rho_max, no sub-problem was solved
    if h_new is None:
      break

    h = h_new

    #Ascent alpha
    alpha += rho * h

    #Verifying constraint tolerance
    if h < h_tol or rho >= rho_max:
      break

  #Applying threshold   
  W_est = model.to_adj().numpy()
  W_est[np.abs(W_est) < threshold] = 0
  return W_est  