In [33]:
##Readme: This notebook is here to analyze failure cases of the notears VAR approach. We first generate some data. 
#Then we normalize it to be 0-mean and 1std and run no tears. So far that has fixed the problem for all data that
#I tried


## Generating the data
import numpy as np

#That was my original random seed that gave me the wild data
np.random.seed(3)
dim = 3
n = 20
A = np.random.normal(0, 2, size = (dim, dim))
A = np.tril(A, k = 0)

# IMPORTANT: Here we force the only non-negative diagonal entry to be negative. This gives the wrong behavior
A[0, 0] = -3.4

# Here we can permute A if we want
# I = np.identity(dim)
# P = np.random.permutation(I)
# A = P.T @ A @ P

X_ini = np.zeros([n,dim])
X_ini[0] = np.random.normal(0, 2, size=(1, dim))

for k in range(n - 1):
    X_ini[k + 1] = X_ini[k] @ A #+np.random.normal(0, 0.5, size=(1, dim)) #I start with the no noise case
    
print('This is our A\n', A)

This is our A
 [[-3.4         0.          0.        ]
 [-3.72698541 -0.55477641  0.        ]
 [-0.16548296 -1.25400135 -0.08763634]]


In [35]:
#We normalize the data to be 0 mean and 1std

N1 = X_ini.std(axis=0)
N2 = X_ini.mean(axis=0)
X = (X_ini - N2) / N1

# Doing this we also have the change the true A. First, we have to add a bias, as we add a constant to our feature X,
# we now have an affine model X_t=X_{t-1}@A+C. We can model that by adding a row to A (called bias here) and add 
# a constant 1 column to X. We will have to do changes accordingly in the notears algorithm

bias = N2.reshape(1, 3) / N1 - ((N2 @ A).T / N1).T #looks a bit wild, but it's really just going through the motions

# Entries of A change as well
A = A.T * N1
A = A.T / N1

# As said before, we have to add the constant 1 to actually take the bias into account
# I will for now implement that implicitly in the notears by checking if we have that extra dimension, so it
# works with affine and linear models

X = np.concatenate((X, np.ones((n, 1))), axis = 1)
A = np.concatenate((A, -bias),axis=0)

print('This is our A after normalizing\n', A)

This is our A after normalizing
 [[-3.40000000e+00  0.00000000e+00  0.00000000e+00]
 [-2.31551006e-10 -5.54776405e-01  0.00000000e+00]
 [-6.48305992e-12 -7.90743759e-01 -8.76363380e-02]
 [-7.36752155e-01  2.13131794e-01 -2.27570025e-01]]


In [36]:
#Checking that we actually changed A correctly
t = 18
print(X[t+1], X[t] @ A)

[ 4.16639254  0.24344874 -0.20923356  1.        ] [ 4.16639254  0.24344874 -0.20923356]


In [37]:
## This is just to initialize all functions for some error analysis later, no need to read it.
n, d = X.shape
rho, alpha, h =  1.0, 0.0, np.inf  # double w_est into (w_pos, w_neg)

lambda1 = 0
loss_type = 'l2'

bnds = [ (0, None) for _ in range(2) for i in range(d) for j in range(d)]
def _loss(W):
    """Evaluate value and gradient of loss."""
    X1=np.delete(X,-1,0)
    X2=np.delete(X[:,:-1],0,0)
    M = X1 @ W
    if loss_type == 'l2':
        R = X2 - M
        loss = 0.5 / X1.shape[0] * (R ** 2).sum()
        G_loss = - 1.0 / X1.shape[0] * X1.T @ R
    #Not changed yet, anyway only works for binary input, not relevant
    elif loss_type == 'logistic':
        loss = 1.0 / X1.shape[0] * (np.logaddexp(0, M) - X * M).sum()
        G_loss = 1.0 / X.shape[0] * X.T @ (sigmoid(M) - X)
    #Also not changed, not sure what poisson loss is
    elif loss_type == 'poisson':
        S = np.exp(M)
        loss = 1.0 / X.shape[0] * (S - X * M).sum()
        G_loss = 1.0 / X.shape[0] * X.T @ (S - X)
    else:
        raise ValueError('unknown loss type')
    return loss, G_loss

def _h(W):
    """Evaluate value and gradient of acyclicity constraint."""
    W=W[:-1,:]
    V=W * W-np.diag(np.diag(W * W))
    E = slin.expm(V)  # (Zheng et al. 2018)
    h = np.trace(E) -d# -np.trace(W*W)
    #     # A different formulation, slightly faster at the cost of numerical stability
    #     M = np.eye(d) + W * W / d  # (Yu et al. 2019)
    #     E = np.linalg.matrix_power(M, d - 1)
    #     h = (E.T * M).sum() - d
    G_h = E.T * V * 2#-2*np.diag(np.diag(W))





    return h, G_h

def _adj(w):
    """Convert doubled variables ([2 d^2] array) back to original variables ([d, d] matrix)."""
    return (w[:(d-1)**2+d-1] - w[(d-1)**2+d-1:]).reshape([d, d-1])

def _func(w):
    """Evaluate value and gradient of augmented Lagrangian for doubled variables ([2 d^2] array)."""
    W = _adj(w)
    loss, G_loss = _loss(W)
    h, G_h = _h(W)
    G_h=np.concatenate((G_h,np.zeros((1,d-1))),axis=0)
    obj = loss + 0.5 * rho * h * h + alpha * h + lambda1 *(w[:(d-1)**2].sum()+w[(d-1)**2+d-1:2*(d-1)**2+d-1].sum())
    G_smooth = G_loss + (rho * h + alpha) * G_h
    g_obj = np.concatenate((G_smooth + lambda1, - G_smooth + lambda1), axis=None)
    return obj, g_obj

In [38]:
import os

path = "C:/Users/s165048/OneDrive - TU Eindhoven/QuinceyFinalProject/final-project/src/Week 12/notears/notears"
os.chdir(path)

import utils

In [69]:
## This actually runs the optimization. NOTE: This now only works for the affine model, so X needs the extra column of ones.
import numpy as np
import scipy.linalg as slin
import scipy.optimize as sopt
import argparse
import warnings
from scipy.special import expit as sigmoid


def notears_linear(X, lambda1, loss_type, max_iter=10, h_tol=1e-8, rho_max=1e+16, w_threshold=0.0):
    """Solve min_W L(W; X) + lambda1 ‖W‖_1 s.t. h(W) = 0 using augmented Lagrangian.

    Args:
        X (np.ndarray): [n, d] sample matrix
        lambda1 (float): l1 penalty parameter
        loss_type (str): l2, logistic, poisson
        max_iter (int): max num of dual ascent steps
        h_tol (float): exit if |h(w_est)| <= htol
        rho_max (float): exit if rho >= rho_max
        w_threshold (float): drop edge if |weight| < threshold

    Returns:
        W_est (np.ndarray): [d, d] estimated DAG
    """
    def _loss(W):
        """Evaluate value and gradient of loss."""
        X1=np.delete(X,-1,0)
        X2=np.delete(X[:,:-1],0,0)
        M = X1 @ W
        if loss_type == 'l2':
            R = X2 - M
            loss = 0.5 / X1.shape[0] * (R ** 2).sum()
            G_loss = - 1.0 / X1.shape[0] * X1.T @ R
        #Not changed yet, anyway only works for binary input, not relevant
        elif loss_type == 'logistic':
            loss = 1.0 / X1.shape[0] * (np.logaddexp(0, M) - X * M).sum()
            G_loss = 1.0 / X.shape[0] * X.T @ (sigmoid(M) - X)
        #Also not changed, not sure what poisson loss is
        elif loss_type == 'poisson':
            S = np.exp(M)
            loss = 1.0 / X.shape[0] * (S - X * M).sum()
            G_loss = 1.0 / X.shape[0] * X.T @ (S - X)
        else:
            raise ValueError('unknown loss type')
        return loss, G_loss

    def _h(W):
        """Evaluate value and gradient of acyclicity constraint."""
        W=W[:-1,:]
        V=W * W-np.diag(np.diag(W * W))
        E = slin.expm(V)  # (Zheng et al. 2018)
        h = np.trace(E) -d# -np.trace(W*W)
        #     # A different formulation, slightly faster at the cost of numerical stability
        #     M = np.eye(d) + W * W / d  # (Yu et al. 2019)
        #     E = np.linalg.matrix_power(M, d - 1)
        #     h = (E.T * M).sum() - d
        G_h = E.T * V * 2#-2*np.diag(np.diag(W))

      
  
 
        
        return h, G_h

    def _adj(w):
        """Convert doubled variables ([2 d^2] array) back to original variables ([d, d] matrix)."""
        return (w[:(d-1)**2+d-1] - w[(d-1)**2+d-1:]).reshape([d, d-1])

    def _func(w):
        """Evaluate value and gradient of augmented Lagrangian for doubled variables ([2 d^2] array)."""
        W = _adj(w)
        loss, G_loss = _loss(W)
        h, G_h = _h(W)
        G_h=np.concatenate((G_h,np.zeros((1,d-1))),axis=0)
        obj = loss + 0.5 * rho * h * h + alpha * h + lambda1 *(w[:(d-1)**2].sum()+w[(d-1)**2+d-1:2*(d-1)**2+d-1].sum())
        G_smooth = G_loss + (rho * h + alpha) * G_h
        g_obj = np.concatenate((G_smooth + lambda1, - G_smooth + lambda1), axis=None)
        return obj, g_obj

    n, d = X.shape
    w_est, rho, alpha, h = np.zeros(2 * ((d-1)**2+d-1)), 1.0, 0.0, np.inf  # double w_est into (w_pos, w_neg)
    #bnds = [ (0, None) for _ in range(2) for i in range(d) for j in range(d)]
    for _ in range(max_iter):
        w_new, h_new = None, None
        while rho < rho_max:
            sol = sopt.minimize(_func, w_est, method='L-BFGS-B', jac=True )
            w_new = sol.x
            h_new, _ = _h(_adj(w_new))
            w_est, h = w_new, h_new
            alpha += rho * h
            if h_new > 0.25 * h:
                rho *= 10
            else:
                break
                
        print(h)
        if np.abs(h) <= h_tol or rho >= rho_max:
            break
    W_est = _adj(w_est)
    W_est[np.abs(W_est) < w_threshold] = 0
    return W_est,w_est

def get_input():
    """Reads an CSV into the data matrix X"""
    input = argparse.ArgumentParser(description='Reads CSV into data matrix X')
    input.add_argument('--X', default=None, help='Pass a CSV file that corresponds to input data X')
    args = input.parse_args()
    return np.genfromtxt(args.X+'.csv',delimiter=',')

if __name__ == '__main__':
    #warnings.simplefilter('error')
    import utils
    utils.set_random_seed(1)
    ## Uncomment the following to run the example
    # n, d, s0, graph_type, sem_type = 100, 20, 20, 'ER', 'gauss'
    # B_true = utils.simulate_dag(d, s0, graph_type)
    # W_true = utils.simulate_parameter(B_true)
    # np.savetxt('W_true.csv', W_true, delimiter=',')

    # X = utils.simulate_linear_sem(W_true, n, sem_type)
    # np.savetxt('X.csv', X, delimiter=',')
    W_est,w_est = notears_linear(X, lambda1=0, loss_type='l2')
    #assert utils.is_dag(W_est)
    #B_true=np.genfromtxt('W_est.csv',delimiter=',')
    print('DONE')
    #np.savetxt('W_est.csv', W_est.T, delimiter=',')
 
    #acc = utils.count_accuracy(B_true, W_est != 0)
    #print(acc)



-0.9999999011250074
-0.12265458183390532
-0.12265458183390532
-0.12265458183390532
-0.12265458183390532
-0.12265458183390532
-0.12265458183390532
-0.12265458183390532
-0.12265458183390532
-0.12265458183390532
DONE


In [70]:
print('True matrix\n',A)
print('Estimated matrix\n',W_est)
B=A.reshape(((d-1)**2+d-1,))
C=np.concatenate((B.clip(min=0),-B.clip(max=0)),axis=0)
#Computing actual objective of the values
val1,_=_func(C)
val2,_=_func(w_est)
print('Value of true matrix',val1,'Value of estimated matrix',val2)
#We see that the actual matrix achieves a much lower value in the objective, so ideally the algorithm would find it
#So, instead is W_est a local minimum, let's check the gradient of the objective
_,grad=_func(w_est)
#Gradient
print('Gradient of w_est',grad[:(d-1)**2]+grad[(d-1)**2+d-1:2*(d-1)**2+d-1])
print('Loss of true matrix',_loss(A)[0],'Loss of estimated matrix',_loss(W_est)[0])


True matrix
 [[-3.40000000e+00  0.00000000e+00  0.00000000e+00]
 [-2.31551006e-10 -5.54776405e-01  0.00000000e+00]
 [-6.48305992e-12 -7.90743759e-01 -8.76363380e-02]
 [-7.36752155e-01  2.13131794e-01 -2.27570025e-01]]
Estimated matrix
 [[-3.3103295  -0.0697375  -0.116879  ]
 [ 0.04477571 -0.59422257 -0.92507845]
 [-0.35883158 -0.97753726 -1.04441649]
 [-0.74285576  0.05889454 -0.12959465]]
Value of true matrix 0.5 Value of estimated matrix 0.2228804268653325
Gradient of w_est [0. 0. 0. 0. 0. 0. 0. 0. 0.]
Loss of true matrix 1.2229655146858948e-32 Loss of estimated matrix 0.2153583536429074
