In [1]:
import numpy as np
import torch
import torch.nn.functional as F

### Load Q, K, V

Generate random test data

In [200]:
psd_ratio = 1  # test how error increases as Q becomes less PSD
d = 64
e = 400
l = 400
batch = 10
seed = 12
assert l >= d
if seed:
    np.random.seed(seed)
    torch.manual_seed(seed)
arange_d = torch.arange(d, 0, -1)
psd_matrix = torch.Tensor(np.random.rand(d, d))
psd_matrix = torch.matmul(psd_matrix, psd_matrix)
psd_matrix = torch.cat([psd_matrix, torch.ones(l - d, d)], dim=0)
q = torch.Tensor(np.random.rand(l, d)) * (1 - psd_ratio) + psd_matrix * psd_ratio
k = torch.Tensor(np.random.rand(l, d)) * (1 - psd_ratio) + psd_matrix * psd_ratio

# make sure query vectors are not all zero (creates large error when solving for K_hat, and will never occur in practice)

q, k = q.type(torch.float), k.type(torch.float)
v = torch.Tensor(np.random.rand(l, d))

q, k, v = q[None].repeat(batch, 1, 1), k[None].repeat(batch, 1, 1), v[None].repeat(batch, 1, 1)

Or load q, k, v from GPT-2 run on WMT (see end of notebook for data collection)

In [244]:
e = 400

qkv = np.load('qkv.npz')
n_layer, n_sample, n_attn_heads, l, d = qkv['q'].shape  # [ layers X samples X attention heads X tokens X q dimension ]

sample_dims = np.random.randint(0, n_sample * n_attn_heads, (n_layer,))
sample_mask = torch.zeros(n_layer, n_sample * n_attn_heads).type(torch.bool)
sample_mask[np.arange(n_layer), sample_dims] = True

# sample along sample and attention head dimensions
q, k, v = (
    torch.Tensor(qkv[n]).reshape(n_layer, n_sample * n_attn_heads, l, d)[sample_mask]
    for n in ('q', 'k', 'v')
)

### Run decomposition
Approximate within-softmax QK decomposition to bring dimensionality reduction outside softmax

In [245]:
A = torch.softmax(q @ k.transpose(-1, -2), dim=-1)
out = A @ v

In [246]:
# decompose A into USD
U, s, D = torch.linalg.svd(A)
S = torch.diag_embed(s, dim1=1, dim2=2)

# take highest singular value vectors
U = U[:,:,:e]
S = S[:,:e,:e]
D = D[:,:e,:]

((A - U @ S @ D) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5

tensor([1.9609e-05, 2.1393e-05, 3.9042e-05, 2.2519e-05, 2.5699e-05, 2.6788e-05,
        3.5920e-05, 4.3862e-05, 2.0114e-05, 2.6681e-05, 2.4320e-05, 3.9945e-05])

In [248]:
# compute pseudo A
US = U @ S

# compute log offset
offset = 1 - US.min(dim=-1).values.min(dim=-1).values
US_ = US + offset[:,None,None]
M_offset = torch.ones_like(US_) * offset[:,None,None]
# M_offset = torch.linalg.lstsq(US_, US).solution  # doesnt work

# make rows of pseudo A sum to 1
rowsum = US_.sum(dim=-1)
A_hat = US_ / rowsum[:,:,None]

# compute V_hat
V_hat = D @ v
output_offset = -(M_offset @ D @ v)
((A - (US_ @ D - M_offset @ D)) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5

tensor([5.4222e-05, 1.7222e-04, 5.0951e-05, 4.6841e-05, 4.4781e-05, 1.0790e-04,
        1.3410e-04, 8.2946e-05, 5.1122e-05, 6.8018e-05, 7.2581e-05, 7.6963e-05])

In [262]:
torch.linalg.matrix_rank(A), torch.linalg.matrix_rank(US), torch.linalg.matrix_rank(US_), torch.linalg.matrix_rank(torch.log(US_))

(tensor([298, 205,  95, 102,  73, 136, 143,  74, 110,  72, 115,  69]),
 tensor([298, 205,  95, 102,  73, 136, 143,  74, 110,  72, 115,  69]),
 tensor([242, 122,  72,  71,  54,  83,  94,  39,  71,  41,  78,  37]),
 tensor([246, 131,  74,  75,  57,  89, 101,  42,  78,  44,  81,  42]))

In [251]:
# solve for K_hat - may find different solutions to same q, US_ due to randomness (even after seeding)
K_hat = torch.linalg.lstsq(q, torch.log(US_)).solution.transpose(-1, -2)
((torch.log(US_) - q @ K_hat.transpose(-1, -2)) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5  # overdetermined system when l > d (always)

tensor([27.4845, 23.1989, 27.0541, 20.6320, 20.7098, 17.6928, 17.4312, 17.5956,
        21.0654, 18.8198, 17.8044, 15.4002])

In [252]:
# recompute A prime
A_hat_p = torch.softmax(q @ K_hat.transpose(-1, -2), dim=-1)
((A_hat - A_hat_p) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5

tensor([0.0185, 0.0164, 0.0173, 0.0165, 0.0169, 0.0137, 0.0141, 0.0072, 0.0178,
        0.0141, 0.0164, 0.0087])

In [256]:
# reconstruct original attention matrix
A_p = (A_hat @ D) * rowsum[:,:,None] - M_offset @ D
A_pp = (A_hat_p @ D) * rowsum[:,:,None] - M_offset @ D
(
    ((A - A_p) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5,
    ((A - A_pp) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5,
    ((A_p - A_pp) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5,
)

(tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003,
         0.0003, 0.0003, 0.0003]),
 tensor([14.7886, 13.1058, 13.8717, 13.2007, 13.4920, 10.9209, 11.2509,  5.7252,
         14.2186, 11.2537, 13.0806,  6.9801]),
 tensor([14.7886, 13.1058, 13.8717, 13.2007, 13.4920, 10.9209, 11.2509,  5.7252,
         14.2186, 11.2537, 13.0806,  6.9801]))

In [258]:
# compute new outputs
out_p = (A_hat @ V_hat) * rowsum[:,:,None] + output_offset
out_pp = (A_hat_p @ V_hat) * rowsum[:,:,None] + output_offset
(
    (((out - out_p) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1),
    (((out - out_pp) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1),
    (((out_p - out_pp) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1),
)

(tensor([3.3264e-05, 7.3146e-05, 6.9990e-05, 7.1059e-05, 7.3563e-05, 1.1071e-04,
         9.4446e-05, 1.5011e-04, 1.1413e-04, 1.5240e-04, 1.7317e-04, 1.6475e-04]),
 tensor([0.9898, 2.4709, 1.7239, 1.4547, 1.9608, 2.1588, 2.6071, 1.1689, 3.4130,
         3.1450, 5.5249, 2.1639]),
 tensor([0.9898, 2.4709, 1.7239, 1.4547, 1.9608, 2.1588, 2.6071, 1.1689, 3.4130,
         3.1450, 5.5249, 2.1639]))

In [259]:
(((out) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1)

tensor([2.1358, 3.6509, 2.5654, 3.0447, 3.2976, 3.0497, 2.9470, 0.9034, 5.4433,
        4.7898, 8.5961, 2.4983])

### Take 2

In [362]:
attn_mask = torch.tril(torch.ones(A.shape[-2:]).type(torch.bool))
A = torch.softmax(q @ k.transpose(-1, -2), dim=-1)
#A[...,~attn_mask] = 0
out = A @ v

In [351]:
# decompose A into USD
U, s, D = torch.linalg.svd(A)
S = torch.diag_embed(s, dim1=1, dim2=2)

# take highest singular value vectors
U = U[:,:,:e]
S = S[:,:e,:e]
D = D[:,:e,:]

((A - U @ S @ D) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5

tensor([1.9609e-05, 2.1393e-05, 3.9042e-05, 2.2519e-05, 2.5699e-05, 2.6788e-05,
        3.5920e-05, 4.3862e-05, 2.0114e-05, 2.6681e-05, 2.4320e-05, 3.9945e-05])

In [352]:
# compute pseudo A
US = U @ S

# compute log offset
offset = 1.3
US_ = US + offset
M_offset = torch.ones_like(US_) * offset
# M_offset = torch.linalg.lstsq(US_, US).solution  # doesnt work

# make rows of pseudo A sum to 1
rowsum = US_.sum(dim=-1)
A_hat = US_ / rowsum[:,:,None]

# compute V_hat
V_hat = D @ v
output_offset = -(M_offset @ D @ v)
((A - (US_ @ D - M_offset @ D)) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5

tensor([4.0832e-05, 1.2165e-04, 4.4922e-05, 3.6562e-05, 3.6650e-05, 8.1338e-05,
        9.3193e-05, 6.2302e-05, 3.5899e-05, 4.8376e-05, 5.6828e-05, 5.9304e-05])

In [353]:
torch.linalg.matrix_rank(A), torch.linalg.matrix_rank(US), torch.linalg.matrix_rank(US_), torch.linalg.matrix_rank(torch.log(US_))

(tensor([298, 205,  95, 102,  73, 136, 143,  74, 110,  72, 115,  69]),
 tensor([298, 205,  95, 102,  73, 136, 143,  74, 110,  72, 115,  69]),
 tensor([246, 131,  73,  74,  56,  88, 102,  42,  79,  44,  80,  42]),
 tensor([260, 156,  84,  87,  63, 107, 116,  54,  88,  54,  92,  52]))

In [354]:
K_hat = torch.linalg.lstsq(q, torch.log(US_)).solution.transpose(-1, -2)
print(((torch.log(US_) - q @ K_hat.transpose(-1, -2)) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5)

tensor([17.6073, 15.0833, 17.5408, 13.9145, 15.2176, 13.6837, 14.0984,  9.5459,
        16.0747, 15.3922, 14.6954,  9.6832])


### Aside - try directly optimizging k, v

In [312]:
from tqdm.auto import tqdm

In [None]:
K_hat = torch.nn.Parameter(torch.zeros_like(k))
V_hat = torch.nn.Parameter(torch.zeros_like(v))
K_hat.data.normal_()
V_hat.data.normal_()

n_steps = 10000
optim = torch.optim.SGD([K_hat, V_hat], lr=0.2)
for i in (pbar := tqdm(range(n_steps))):
    loss = (((torch.softmax(q @ K_hat.transpose(-2, -1), dim=-1) @ V_hat) - out) ** 2).sum(dim=-1).mean()
    pbar.set_description(f"loss: {loss.item()}")
    loss.backward()
    optim.step()
    optim.zero_grad()

In [331]:
(((out - (torch.softmax(q @ K_hat.transpose(-2, -1), dim=-1) @ V_hat)) ** 2).sum(dim=-1) ** 0.5).mean()

tensor(0.4077, grad_fn=<MeanBackward0>)

In [334]:
with torch.no_grad():
    print(((((out - (torch.softmax(q @ K_hat.transpose(-2, -1), dim=-1) @ V_hat)) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1)))

tensor([1.2162, 0.5490, 0.0219, 0.0767, 0.0740, 0.3159, 0.4229, 0.1210, 0.2291,
        0.6276, 0.1438, 1.0947])


### Try solving for original V

In [372]:
A = torch.softmax(q @ k.transpose(-1, -2), dim=-1)
#A[...,~attn_mask] = 0
out = A @ v

In [381]:
v__ = torch.linalg.solve(A, out).solution

_LinAlgError: torch.linalg.solve: (Batch element 4): The solver failed because the input matrix is singular.

In [377]:
print((((out - A @ v__) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1))

tensor([2.1046, 3.6361, 2.5274, 3.0096, 3.0790, 3.1536, 2.8280, 0.6648, 5.4319,
        4.6320, 8.5844, 1.8041])


In [361]:
print((((out) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1))

tensor([0.9714, 0.4000, 0.0071, 0.0422, 0.0416, 0.2649, 0.3864, 0.3217, 0.1348,
        0.7272, 0.0907, 1.4685])


In [376]:
V_hat = torch.nn.Parameter(torch.zeros_like(v))
V_hat.data.normal_()

n_steps = 10000
optim = torch.optim.SGD([V_hat], lr=0.2)
for i in (pbar := tqdm(range(n_steps))):
    loss = (((A @ V_hat) - out) ** 2).sum(dim=-1).mean()
    pbar.set_description(f"loss: {loss.item()}")
    loss.backward()
    optim.step()
    optim.zero_grad()

  0%|          | 0/10000 [00:00<?, ?it/s]

In [380]:
print((((out - A @ V_hat) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1).detach())

tensor([1.8189, 0.9440, 0.4758, 0.5052, 0.4074, 0.7642, 0.8565, 0.3587, 0.4798,
        0.4630, 0.8575, 0.4125])


### Try solving for V_hat after within-softmax decomposition

### Continue decomp

In [355]:
# solve for K_hat - may find different solutions to same q, US_ due to randomness (even after seeding)
# overdetermined system when l > d (always) - the larger the offset the worse the solution
# optimal offset around 1.3 (balance pos/neg post-log values)
xs = np.arange(0.1, 0.5, 0.1)
for x in xs:
    K_hat = torch.linalg.lstsq(q, torch.log(US + 1 + x)).solution.transpose(-1, -2)
    print(((torch.log(US + 1 + x) - q @ K_hat.transpose(-1, -2)) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5)

tensor([21.6346, 17.7610, 22.2047, 16.8955, 20.1448, 18.2865, 19.0530, 11.2748,
        21.1698, 21.8627, 19.5552, 12.5265])
tensor([18.3572, 15.5771, 18.5209, 14.5647, 16.5345, 15.0256, 15.5906,  9.5470,
        17.5020, 17.3379, 16.1797, 10.2331])
tensor([17.6073, 15.0833, 17.5408, 13.9145, 15.2176, 13.6837, 14.0984,  9.5459,
        16.0747, 15.3922, 14.6954,  9.6832])
tensor([18.0905, 15.5026, 17.8915, 14.1389, 15.0216, 13.3313, 13.6019, 10.3054,
        15.7617, 14.7003, 14.1734,  9.9381])


In [298]:
# recompute A prime
A_hat_p = torch.softmax(q @ K_hat.transpose(-1, -2), dim=-1)
((A_hat - A_hat_p) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5

tensor([0.0236, 0.0096, 0.0018, 0.0065, 0.0039, 0.0104, 0.0137, 0.0076, 0.0074,
        0.0147, 0.0082, 0.0105])

In [299]:
# reconstruct original attention matrix
A_p = (A_hat @ D) * rowsum[:,:,None] - M_offset @ D
A_pp = (A_hat_p @ D) * rowsum[:,:,None] - M_offset @ D
(
    ((A - A_p) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5,
    ((A - A_pp) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5,
    ((A_p - A_pp) ** 2).sum(dim=-1).sum(dim=-1) ** 0.5,
)

(tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002,
         0.0002, 0.0002, 0.0002]),
 tensor([12.2878,  4.9760,  0.9185,  3.3771,  2.0346,  5.3996,  7.1118,  3.9255,
          3.8505,  7.6451,  4.2630,  5.4825]),
 tensor([12.2878,  4.9760,  0.9185,  3.3771,  2.0346,  5.3996,  7.1118,  3.9255,
          3.8505,  7.6451,  4.2630,  5.4825]))

In [300]:
# compute new outputs
out_p = (A_hat @ V_hat) * rowsum[:,:,None] + output_offset
out_pp = (A_hat_p @ V_hat) * rowsum[:,:,None] + output_offset
(
    (((out - out_p) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1),
    (((out - out_pp) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1),
    (((out_p - out_pp) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1),
)

(tensor([2.7750e-05, 4.2608e-05, 9.9115e-05, 4.0284e-05, 4.5616e-05, 7.7316e-05,
         6.4753e-05, 8.6857e-05, 6.2070e-05, 9.1452e-05, 1.0988e-04, 1.2083e-04]),
 tensor([0.9489, 0.8771, 0.0253, 0.1572, 0.2246, 0.3211, 0.7626, 0.1703, 0.4477,
         1.1651, 0.2970, 1.0703]),
 tensor([0.9489, 0.8771, 0.0253, 0.1572, 0.2246, 0.3211, 0.7626, 0.1703, 0.4477,
         1.1651, 0.2970, 1.0703]))

In [301]:
(((out) ** 2).sum(dim=-1) ** 0.5).mean(dim=-1)

tensor([0.9714, 0.4000, 0.0071, 0.0422, 0.0416, 0.2649, 0.3864, 0.3217, 0.1348,
        0.7272, 0.0907, 1.4685])

### Potential issues
- Recomputing softmax for A_pp without original scaling factor of A_p
- Scaling in alpha and inverse alpha at different points where they dont cancel
- All-zero query vectors will make solving for K_hat ineffective
- Q x K_hat = A_hat is overdetermined for K_hat

### Collect Q, K, V from WMT

In [1]:
from tqdm.auto import tqdm
from datasets import load_dataset
import numpy as  np
import torch
import torch.nn as nn
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import transformers_drop_in as drop_in
import tensor_util as tu
from config import CONFIG

In [2]:
CONFIG.do_consolidate = True
CONFIG.consolidate_ratio = 0.5
CONFIG.context_length = 400
CONFIG.consolidate_length = 200
CONFIG.temperature = 0.1
CONFIG.fix_prune_rate = True

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
dataset = load_dataset('wikitext', 'wikitext-103-v1')
split = dataset['train']

In [4]:
l = np.load('token_length.npy')
all([len(tok) > 512 for tok in tokenizer.batch_encode_plus([split[int(i)]['text'] for i in np.where(l > 512)[0][:20]])['input_ids']])

True

In [5]:
model = GPT2LMHeadModel.from_pretrained('gpt2').to(CONFIG.device)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2AttentionDropIn(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
batch_size = 1
indices, = np.where(l > CONFIG.context_length)
batch_iter = iter(np.array_split(np.random.choice(indices, len(indices), replace=False), len(indices) // batch_size))

In [7]:
n_layer = 12
drop_in.GLOBALS.outputs = {
    'q': [[] for _ in range(n_layer)],
    'k': [[] for _ in range(n_layer)],
    'v': [[] for _ in range(n_layer)],
}

def record_attn(layer_idx, query, key, value, unnormalized_attn, final_attn, attn_output, attn_mask):
    drop_in.GLOBALS.outputs['q'][layer_idx] += [query.cpu()]
    drop_in.GLOBALS.outputs['k'][layer_idx] += [key.cpu()]
    drop_in.GLOBALS.outputs['v'][layer_idx] += [value.cpu()]


def no_op(query, key, value, attn_weights):
    pass

drop_in.record_attn_vars = record_attn

In [8]:
n_sample = 20
n_layer = 12
cols = 4
rows = n_layer // cols
rank_by_layer = [[] for _ in range(n_layer)]
with torch.no_grad():
    for i in tqdm(range(n_sample)):
        batch = next(batch_iter)
        model_input = {name: t.to(CONFIG.device) for name, t in tokenizer.batch_encode_plus(split[batch]['text'],
                                                                                         return_tensors="pt",
                                                                                         truncation=True,
                                                                                         max_length=CONFIG.context_length).items()}
        model(**model_input)

  0%|          | 0/20 [00:00<?, ?it/s]

In [14]:
qkv = {
    'q': [None for _ in range(n_layer)],
    'k': [None for _ in range(n_layer)],
    'v': [None for _ in range(n_layer)],
}
for m in ['q', 'k', 'v']:
    for i in range(n_layer):
        qkv[m][i] = torch.cat(drop_in.GLOBALS.outputs[m][i], dim=0)
    qkv[m] = np.stack(qkv[m], axis=0)
np.savez('qkv.npz', **qkv)

### SVD from eigen decomposition

In [238]:
U, s, D = torch.linalg.svd(A)
val, vec = torch.linalg.eig(A @ A.transpose(-1, -2))
val_, vec_ = torch.linalg.eig(A.transpose(-1, -2) @ A)

In [239]:
s[0][:10]**2, val[0][:10], val_[0][:10]

(tensor([16.6662, 15.0797, 14.3649, 13.5416, 12.0955, 10.6551,  8.5561,  7.0162,
          6.6288,  5.9499]),
 tensor([16.6662+0.j, 15.0797+0.j, 14.3649+0.j, 13.5416+0.j, 12.0956+0.j, 10.6551+0.j,
          8.5561+0.j,  7.0162+0.j,  6.6288+0.j,  5.9499+0.j]),
 tensor([16.6662+0.j, 15.0797+0.j, 14.3649+0.j, 13.5416+0.j, 12.0956+0.j, 10.6551+0.j,
          8.5561+0.j,  7.0162+0.j,  6.6288+0.j,  5.9500+0.j]))

In [243]:
U[0][:5,:5], vec[0][:5,:4]

(tensor([[ 1.1310e-03, -1.3311e-04,  5.6899e-04, -1.0748e-03,  2.4993e-03],
         [ 2.5153e-02,  2.7871e-03,  1.7440e-02, -7.4674e-04,  1.2350e-01],
         [ 1.9880e-03, -2.5313e-05,  3.4438e-04, -9.2611e-05,  3.1835e-03],
         [ 7.2101e-03, -1.0660e-02,  1.3801e-03, -1.3957e-03,  5.7525e-03],
         [ 6.1774e-03, -3.2167e-03,  5.2106e-04, -1.2708e-03,  7.1740e-03]]),
 tensor([[-1.1311e-03+0.j, -1.3302e-04+0.j, -5.6890e-04+0.j,  1.0750e-03+0.j],
         [-2.5153e-02+0.j,  2.7869e-03+0.j, -1.7440e-02+0.j,  7.4698e-04+0.j],
         [-1.9880e-03+0.j, -2.5305e-05+0.j, -3.4451e-04+0.j,  9.2737e-05+0.j],
         [-7.2097e-03+0.j, -1.0660e-02+0.j, -1.3800e-03+0.j,  1.3958e-03+0.j],
         [-6.1774e-03+0.j, -3.2167e-03+0.j, -5.2106e-04+0.j,  1.2708e-03+0.j]]))