In [1]:
import numpy as np
import copy
#import numba
import random
from scipy import sparse
import multiprocessing as mp

import matplotlib.pyplot as plt

In [2]:
def generate_data(ndata, nv, acc, deg, nlf):
    cardinality = 2 * np.ones(nv, np.int64)     # all vocab terms are binary

    Y = np.empty(ndata, np.int64)        # generate the vocab samples
    V = np.empty((ndata, nv), np.int64)   # generate the vocab samples
    for i in range(ndata):
        r = random.random()
        Y[i] = 1 if (random.random() < 0.5) else -1
        for j in range(nv):
            V[i, j] = Y[i] if (random.random() < acc) else -Y[i]
            if V[i, j] == -1:
                V[i, j] = 0
            
    L = np.empty((ndata, nlf), np.int64)
    for i in range(deg):
        L[:, i] = 2 * np.maximum(V[:, i], V[:, -1]) - 1
    for i in range(deg, nlf):
        L[:, i] = 2 * V[:, i] - 1
    
    # calculate mu's here
    accuracies = np.array([np.mean(L[:,i] == Y) for i in range(nlf)])
    mu_from_acc = 2*accuracies - 1
        
    return Y, V, L, cardinality, mu_from_acc

In [3]:
Y, V, L, cardinality, mu = generate_data(10000, 11, 0.75, 9, 10)

In [4]:
print(L.shape)
print(mu.shape)

(10000, 10)
(10,)


In [5]:
O = np.dot(L.T,L)/(9999)
print(O.shape)
O_inv = np.linalg.inv(O)
sig = O - np.outer(mu,mu)

(10, 10)


In [6]:
sig_inv = np.linalg.pinv(sig)

In [7]:
np.set_printoptions(precision=3)
print(sig_inv)

[[ 2.407e+00 -2.404e-01 -2.795e-01 -2.578e-01 -2.371e-01 -3.110e-01
  -2.562e-01 -3.041e-01 -2.658e-01  1.360e-02]
 [-2.404e-01  2.414e+00 -2.969e-01 -2.715e-01 -2.495e-01 -2.520e-01
  -2.321e-01 -3.362e-01 -2.559e-01  1.140e-02]
 [-2.795e-01 -2.969e-01  2.435e+00 -2.524e-01 -2.621e-01 -2.416e-01
  -2.505e-01 -2.893e-01 -2.970e-01 -1.508e-02]
 [-2.578e-01 -2.715e-01 -2.524e-01  2.366e+00 -2.666e-01 -3.006e-01
  -3.067e-01 -2.243e-01 -2.008e-01  1.879e-02]
 [-2.371e-01 -2.495e-01 -2.621e-01 -2.666e-01  2.400e+00 -2.880e-01
  -2.706e-01 -3.058e-01 -2.504e-01 -2.435e-02]
 [-3.110e-01 -2.520e-01 -2.416e-01 -3.006e-01 -2.880e-01  2.456e+00
  -2.855e-01 -2.239e-01 -2.923e-01 -5.396e-04]
 [-2.562e-01 -2.321e-01 -2.505e-01 -3.067e-01 -2.706e-01 -2.855e-01
   2.394e+00 -2.768e-01 -2.476e-01  2.777e-03]
 [-3.041e-01 -3.362e-01 -2.893e-01 -2.243e-01 -3.058e-01 -2.239e-01
  -2.768e-01  2.512e+00 -2.881e-01 -3.719e-02]
 [-2.658e-01 -2.559e-01 -2.970e-01 -2.008e-01 -2.504e-01 -2.923e-01
  -2.476e-01

In [8]:
J_clean = copy.deepcopy(sig_inv)
for i in range(sig_inv.shape[0]):
    for j in range(sig_inv.shape[1]):
        if abs(sig_inv[i,j]) < 0.1:
            J_clean[i,j] = 0
        else:
            J_clean[i,j] = sig_inv[i,j]

In [9]:
print(J_clean)

[[ 2.407 -0.24  -0.279 -0.258 -0.237 -0.311 -0.256 -0.304 -0.266  0.   ]
 [-0.24   2.414 -0.297 -0.271 -0.249 -0.252 -0.232 -0.336 -0.256  0.   ]
 [-0.279 -0.297  2.435 -0.252 -0.262 -0.242 -0.251 -0.289 -0.297  0.   ]
 [-0.258 -0.271 -0.252  2.366 -0.267 -0.301 -0.307 -0.224 -0.201  0.   ]
 [-0.237 -0.249 -0.262 -0.267  2.4   -0.288 -0.271 -0.306 -0.25   0.   ]
 [-0.311 -0.252 -0.242 -0.301 -0.288  2.456 -0.285 -0.224 -0.292  0.   ]
 [-0.256 -0.232 -0.251 -0.307 -0.271 -0.285  2.394 -0.277 -0.248  0.   ]
 [-0.304 -0.336 -0.289 -0.224 -0.306 -0.224 -0.277  2.512 -0.288  0.   ]
 [-0.266 -0.256 -0.297 -0.201 -0.25  -0.292 -0.248 -0.288  2.38   0.   ]
 [ 0.     0.     0.     0.     0.     0.     0.     0.     0.     1.321]]


In [10]:
def generate_data_block_diag(ndata, acc, deg, nlf):
    assert(nlf % deg == 0)
    nv = nlf+deg
    cardinality = 2 * np.ones(nv, np.int64)     # all vocab terms are binary
    Y = np.empty(ndata, np.int64)        # generate the vocab samples
    V = np.empty((ndata, nv), np.int64)   # generate the vocab samples
    for i in range(ndata):
        r = random.random()
        Y[i] = 1 if (random.random() < 0.5) else -1
        for j in range(nv):
            V[i, j] = Y[i] if (random.random() < acc) else -Y[i]
            if V[i, j] == -1:
                V[i, j] = 0
            
    L = np.empty((ndata, nlf), np.int64)
    num_blocks = int(nlf/deg)
    for k in range(num_blocks):
        for i in range(k*deg,(k+1)*deg):
            L[:, i] = 2 * np.maximum(V[:, i], V[:, -k]) - 1
#         for i in range(deg, 2*deg):
#             L[:, i] = 2 * np.maximum(V[:, i], V[:, -3]) - 1
#         for i in range(2*deg, nlf):
#             L[:, i] = 2 * np.maximum(V[:, i], V[:, -2]) - 1
    
    # calculate mu's here
    accuracies = np.array([np.mean(L[:,i] == Y) for i in range(nlf)])
    mu_from_acc = 2*accuracies - 1
        
    return Y, V, L, cardinality, mu_from_acc, accuracies

In [11]:
n = 10000
m = 9
deg = 3
acc = 0.75

In [12]:
Y, V, L, cardinality, mu, accuracies = generate_data_block_diag(n,acc, deg, m)
O = np.dot(L.T,L)/(n-1)
O_inv = np.linalg.inv(O)
sig = O - np.outer(mu,mu)
sig_inv = np.linalg.pinv(sig)

J_clean = copy.deepcopy(sig_inv)
for i in range(sig_inv.shape[0]):
    for j in range(sig_inv.shape[1]):
        if abs(sig_inv[i,j]) < 0.15:
            J_clean[i,j] = 0
        else:
            J_clean[i,j] = sig_inv[i,j]
print(J_clean)

print(accuracies)

[[ 2.195 -0.863 -0.817  0.     0.     0.     0.     0.     0.   ]
 [-0.863  2.16  -0.51   0.    -0.168  0.     0.     0.     0.   ]
 [-0.817 -0.51   2.129  0.     0.     0.     0.     0.     0.   ]
 [ 0.     0.     0.     2.034 -0.619 -0.65   0.     0.     0.   ]
 [ 0.    -0.168  0.    -0.619  2.001 -0.595  0.     0.     0.   ]
 [ 0.     0.     0.    -0.65  -0.595  2.062  0.     0.     0.   ]
 [ 0.     0.     0.     0.     0.     0.     2.012 -0.659 -0.587]
 [ 0.     0.     0.     0.     0.     0.    -0.659  2.084 -0.687]
 [ 0.     0.     0.     0.     0.     0.    -0.587 -0.687  2.029]]
[0.75  0.75  0.75  0.752 0.752 0.761 0.746 0.747 0.745]


In [13]:
print(L)

[[-1 -1  1 ...  1  1  1]
 [-1 -1  1 ... -1  1 -1]
 [-1  1 -1 ... -1 -1 -1]
 ...
 [ 1  1  1 ...  1  1  1]
 [-1 -1 -1 ... -1 -1 -1]
 [-1 -1  1 ... -1 -1 -1]]


In [14]:
def generate_data_choice_blocks(ndata, acc, deg, nlf):
    assert(nlf % deg == 0)
    nv = nlf+deg
    cardinality = 2 * np.ones(nv, np.int64)     # all vocab terms are binary
    Y = np.empty(ndata, np.int64)        # generate the vocab samples
    V = np.empty((ndata, nv), np.int64)   # generate the vocab samples
    for i in range(ndata):
        r = random.random()
        Y[i] = 1 if (random.random() < 0.5) else -1
        for j in range(nv):
            V[i, j] = Y[i] if (random.random() < acc) else -Y[i]
            if V[i, j] == -1:
                V[i, j] = 0
            
    L = np.empty((ndata, nlf), np.int64)
    num_blocks = int(nlf/deg)
    blocks = np.random.choice(range(num_blocks), size=2, replace=False, p=None)
    for k in range(num_blocks):
        if k in blocks:
            for i in range(k*deg,(k+1)*deg):
                L[:, i] = 2 * np.maximum(V[:, i], V[:, -k]) - 1
        else:
            for i in range(k*deg,(k+1)*deg):
                L[:, i] = 2 * V[:, i] - 1
    
    # calculate mu's here
    accuracies = np.array([np.mean(L[:,i] == Y) for i in range(nlf)])
    mu_from_acc = 2*accuracies - 1
        
    return Y, V, L, cardinality, mu_from_acc, accuracies

In [15]:
Y, V, L, cardinality, mu, accuracies = generate_data_choice_blocks(n,acc, deg, m)
O = np.dot(L.T,L)/(n-1)
O_inv = np.linalg.inv(O)
sig = O - np.outer(mu,mu)
sig_inv = np.linalg.pinv(sig)

J_clean = copy.deepcopy(sig_inv)
for i in range(sig_inv.shape[0]):
    for j in range(sig_inv.shape[1]):
        if abs(sig_inv[i,j]) < 0.15:
            J_clean[i,j] = 0
        else:
            J_clean[i,j] = sig_inv[i,j]
print(J_clean)

print(accuracies)

[[ 2.091 -0.756 -0.744  0.161  0.     0.     0.     0.     0.   ]
 [-0.756  2.089 -0.588  0.    -0.153  0.     0.     0.     0.   ]
 [-0.744 -0.588  2.073 -0.154  0.    -0.15   0.     0.     0.   ]
 [ 0.161  0.    -0.154  1.99  -0.581 -0.641  0.     0.     0.   ]
 [ 0.    -0.153  0.    -0.581  2.018 -0.662  0.     0.     0.   ]
 [ 0.     0.    -0.15  -0.641 -0.662  2.036  0.     0.     0.   ]
 [ 0.     0.     0.     0.     0.     0.     1.328  0.     0.   ]
 [ 0.     0.     0.     0.     0.     0.     0.     1.325  0.   ]
 [ 0.     0.     0.     0.     0.     0.     0.     0.     1.333]]
[0.751 0.751 0.749 0.752 0.755 0.75  0.748 0.747 0.75 ]


In [16]:
def generate_data_d_braid(ndata, acc, deg, nlf):
    assert(nlf % deg == 0)
    nv = nlf-deg+1
    cardinality = 2 * np.ones(nv, np.int64)     # all vocab terms are binary
    Y = np.empty(ndata, np.int64)        # generate the vocab samples
    V = np.empty((ndata, nv), np.int64)   # generate the vocab samples
    W = np.empty((ndata,nlf), np.int64)
    for i in range(ndata):
        r = random.random()
        Y[i] = 1 if (random.random() < 0.5) else -1
        for j in range(nv):
            V[i, j] = Y[i] if (random.random() < acc) else -Y[i]
            if V[i, j] == -1:
                V[i, j] = 0
        # for the individual variables
        for j in range(nlf):
            W[i,j] = Y[i] if (random.random() < acc) else -Y[i]
            if W[i, j] == -1:
                W[i, j] = 0
            
    L = np.empty((ndata, nlf), np.int64)
    for i in range(deg-1,nlf-deg+1):
        tmplist = []
        tmplist.append(W[:,i])
        for r in range(1,deg):
            tmplist.append(V[:,i-r])
        tmp = np.maximum.reduce(tmplist)
        L[:,i] = 2*tmp - 1
        
    for i in range(deg-1):
        tmplist = []
        tmplist.append(W[:,i])
        for r in range(i):
            tmplist.append(V[:,i+r])
        tmp = np.maximum.reduce(tmplist)
        L[:,i] = 2*tmp - 1
    
    base = nlf-deg
    for i in range(nlf-deg+1,nlf):
        tmplist = []
        tmplist.append(W[:,i])
        print("i: ", i)
        for r in range(nlf-i):
            print("base-r: ",base-r)
            #import pdb; pdb.set_trace()
            tmplist.append(V[:,base-r])
        tmp = np.maximum.reduce(tmplist)
        L[:,i] = 2*tmp - 1
            
    
    # calculate mu's here
    accuracies = np.array([np.mean(L[:,i] == Y) for i in range(nlf)])
    mu_from_acc = 2*accuracies - 1
        
    return Y, V, L, cardinality, mu_from_acc, accuracies

In [17]:
Y, V, L, cardinality, mu, accuracies = generate_data_d_braid(n,acc, deg, m)
O = np.dot(L.T,L)/(n-1)
print(O)
O_inv = np.linalg.inv(O)
sig = O - np.outer(mu,mu)
sig_inv = np.linalg.pinv(sig)

J_clean = copy.deepcopy(sig_inv)
for i in range(sig_inv.shape[0]):
    for j in range(sig_inv.shape[1]):
        if abs(sig_inv[i,j]) < 0.15:
            J_clean[i,j] = 0
        else:
            J_clean[i,j] = sig_inv[i,j]
print(J_clean)

print(accuracies)

i:  7
base-r:  6
base-r:  5
i:  8
base-r:  6
[[1.    0.253 0.194 0.203 0.208 0.214 0.205 0.214 0.255]
 [0.253 1.    0.574 0.574 0.398 0.405 0.418 0.407 0.391]
 [0.194 0.574 1.    0.6   0.475 0.471 0.482 0.466 0.399]
 [0.203 0.574 0.6   1.    0.611 0.474 0.472 0.465 0.403]
 [0.208 0.398 0.475 0.611 1.    0.591 0.49  0.469 0.41 ]
 [0.214 0.405 0.471 0.474 0.591 1.    0.612 0.476 0.399]
 [0.205 0.418 0.482 0.472 0.49  0.612 1.    0.603 0.404]
 [0.214 0.407 0.466 0.465 0.469 0.476 0.603 1.    0.565]
 [0.255 0.391 0.399 0.403 0.41  0.399 0.404 0.565 1.   ]]
[[ 1.326  0.     0.     0.     0.     0.     0.     0.     0.   ]
 [ 0.     1.908 -0.536 -0.584  0.     0.     0.     0.     0.   ]
 [ 0.    -0.536  1.944 -0.531  0.    -0.18  -0.186 -0.161  0.   ]
 [ 0.    -0.584 -0.531  2.229 -0.774  0.     0.     0.     0.   ]
 [ 0.     0.     0.    -0.774  2.039 -0.641  0.    -0.152  0.   ]
 [ 0.     0.    -0.18   0.    -0.641  2.02  -0.712  0.     0.   ]
 [ 0.     0.    -0.186  0.     0.    -0.712  