In [1]:
!pip install -q torch
!pip install -q datasets
!pip install -q sentencepiece

!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

!pip install scipy
!pip install protobuf
!pip install scipy
!pip install tqdm
# !python -m pip install optimum

[0m

In [2]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, \
    BitsAndBytesConfig
from peft import PeftModel

adapters_name = "model_outputs/dna_computing_model/"
model_name = "openlm-research/open_llama_7b"


nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
    quantization_config=nf4_config,
    output_attentions=True,
)
model = PeftModel.from_pretrained(
    base_model,
    adapters_name,
)
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "right"
tokenizer.padding_side = "left"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
def mem_check():
    print(torch.cuda.memory_summary(abbreviated=True))

mem_check()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   3748 MiB |   3942 MiB |  42133 MiB |  38384 MiB |
|---------------------------------------------------------------------------|
| Active memory         |   3748 MiB |   3942 MiB |  42133 MiB |  38384 MiB |
|---------------------------------------------------------------------------|
| Requested memory      |   3726 MiB |   3919 MiB |  42074 MiB |  38347 MiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |   4118 MiB |   4118 MiB |   4204 MiB |  88064 KiB |
|---------------------------------------------------------------

In [4]:
with open("DNA_computing.txt", "r", encoding="utf-8") as f:
    text = f.read()
print(len(text))

16608


In [5]:
from tqdm import tqdm
import numpy as np
from IPython.core.display import display, HTML

def aggregate_attention(attn):
    '''Extract average attention vector'''
    avged = []
    for layer in attn:
        layer_attns = layer.squeeze(0)
        attns_per_head = layer_attns.mean(dim=0)
        vec = torch.concat((
            # We zero the first entry because it's what's called
            # null attention (https://aclanthology.org/W19-4808.pdf)
            torch.tensor([0.]),
            # usually there's only one item in attns_per_head but
            # on the first generation, there's a row for each token
            # in the prompt as well, so take [-1]
            attns_per_head[-1][1:].to("cpu"),
            # add zero for the final generated token, which never
            # gets any attention
            torch.tensor([0.]),
        ))
        avged.append(vec / vec.sum())
    return torch.stack(avged).mean(dim=0)
    
def precompute_attns(text):
    # right now I'm not worried about batch inference
    inputs = tokenizer.encode(text, return_tensors="pt")
    result = []
    for n in tqdm(range(2, len(inputs[0]))):
        outputs = model(
            inputs[:,:n],
            output_attentions=True,
            return_dict=True,
        )
        result.append(aggregate_attention(outputs.attentions))
    return result

# attns = precompute_attns("DNA computing is an emerging branch of unconventional computing which uses DNA, biochemistry, and molecular")
# attns

  from IPython.core.display import display, HTML


In [6]:
import gc
gc.collect()
torch.cuda.empty_cache()
gc.collect()

0

In [7]:
inputs = tokenizer.encode(text, return_tensors="pt")

Token indices sequence length is longer than the specified maximum sequence length for this model (3606 > 2048). Running this sequence through the model will result in indexing errors


In [8]:
# import time
# start = time.time()
# temp_outputs = model(
#     inputs[:,:2048],
#     # output_attentions=True,
#     output_hidden_states=False,
#     past_key_values=None,
#     # use_cache=False,
# )
# end = time.time()
# print(end - start)
# temp_outputs.__dict__.keys()

In [9]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    

In [10]:
temp_inputs = inputs[:,:2048]
token_embeds = model.model.embed_tokens(temp_inputs)
fee = model.model.layers[0](token_embeds, output_attentions=True)

# TODO: aggregate attention while going through the forward pass

In [11]:
fee[0].shape, fee[1].shape

(torch.Size([1, 2048, 4096]), torch.Size([1, 32, 2048, 2048]))

In [12]:
low_info_tokens = [tokenizer.encode(token)[1] for token in [",", "the", "of"]]
low_info_tokens.append(31876)

has_info = inputs[0] != torch.tensor(low_info_tokens).reshape(-1,1).all(0)
has_info[0] = 0 # null attention
has_info

tensor([False,  True,  True,  ...,  True,  True,  True])

In [13]:
import time



def agg_attn(inputs):
    n = inputs.shape[1]

    has_info = inputs[0] != torch.tensor(low_info_tokens).reshape(-1,1).all(0)
    has_info[0] = 0 # null attention
    has_info = has_info.to("cuda")
    
    prev_outputs = model.model.embed_tokens(inputs)
    avged = torch.zeros((n, n)).to("cuda")
    print(inputs.shape)
    for layer in model.model.layers:
        curr_outputs, curr_attns = layer(prev_outputs, output_attentions=True)
        attns_per_head = curr_attns[-1].mean(dim=0)
        # print(attns_per_head.shape, curr_attns[-1].shape)
        # assert False
        vec = attns_per_head * has_info
        # assert curr_attns.sum() != 0
        # assert (curr_attns != 0).any()
        # assert (vec != 0).any()
        avged += vec #/ vec.sum()
        prev_outputs = curr_outputs
    return avged
        
start = time.time()
avged = agg_attn(inputs[:,:2048])
end = time.time()
end - start

torch.Size([1, 2048])


0.24239850044250488

In [14]:
# (avged != 0).any()
max(avged.mean(0))

tensor(0.2166, device='cuda:0')

In [15]:
avged.mean(0).max()

tensor(0.2166, device='cuda:0')

In [16]:
inputs.shape

torch.Size([1, 3606])

In [17]:
torch.zeros((1, 10, 20, 20)).squeeze(0).mean(dim=0).shape

torch.Size([20, 20])

In [18]:
avged.max(), avged.mean(0).max()

(tensor(1.6005, device='cuda:0'), tensor(0.2166, device='cuda:0'))

In [19]:
avged.shape

torch.Size([2048, 2048])

In [20]:
mem_check()
import gc
gc.collect()
torch.cuda.empty_cache()
gc.collect()
mem_check()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   4052 MiB |   5428 MiB | 144629 MiB | 140576 MiB |
|---------------------------------------------------------------------------|
| Active memory         |   4052 MiB |   5428 MiB | 144629 MiB | 140576 MiB |
|---------------------------------------------------------------------------|
| Requested memory      |   4030 MiB |   5406 MiB | 144539 MiB | 140508 MiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |   6114 MiB |   6114 MiB |   6428 MiB | 321536 KiB |
|---------------------------------------------------------------

In [84]:
torch.concat([torch.zeros(10).reshape(1, -1).T, torch.ones(5).reshape(1, -1).T]).T

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.]])

In [72]:
tokenizer.padding_side = "right"

In [109]:
from tqdm import tqdm
import numpy as np
from IPython.display import display, HTML

def heterogenous_stack(vecs):
    '''Pad vectors with zeros then stack'''
    max_length = max(v.shape[0] for v in vecs)
    return torch.stack([
        torch.concat((v, torch.zeros(max_length - v.shape[0])))
        for v in vecs
    ])

class AttentionVisualizer:

    def __init__(self, model, tokenizer, drop_low_info_tokens=True, copy_matt=False):
        self.low_info_tokens = [tokenizer.encode(token)[1]
            for token in [
                ",",
                "the",
                "of",
                "an",
                # "a",
                # "'",
                # "DNA", # test
            ]
        ]
        # 31876 is the token for an apostrophe '
        # If I pass in "'" to the encode function, it will be encoded as
        # 910, which is different from what I get if there are characters preceding
        # it
        self.low_info_tokens += [31876]
        self.model = model
        self.tokenizer = tokenizer
        self.drop_low_info_tokens = drop_low_info_tokens
        self.copy_matt = copy_matt

    def fit(self, text):
        """
        Don't have to worry about fine-tuning the model on text yet
        """
        self.raw_text = text
        self.tokens_encoded = self.tokenizer.encode(text, return_tensors="pt")#.to("mps")
        self.tokens_decoded = [self.tokenizer.decode(token) for token in self.tokens_encoded[0]]
        self._precompute_attns()

    def filtered_aggregate_attention(self, inputs):
        n = inputs.shape[1]
        # padding doesn't seem to be the culprit
        # inputs = torch.concat([inputs[0], 
        #                        torch.tensor([tokenizer.pad_token_id] * (2048 - n))])\
        #     .reshape(1,-1)
        has_info = torch.ones(n)
        if self.drop_low_info_tokens:
            has_info = (inputs[0] != torch.tensor(self.low_info_tokens).reshape(-1,1)).all(0)
        #print(has_info)
        #print("low info tokens excluded:", (~has_info).sum())
        has_info[0] = 0 # null attention
        # has_info[-1] = 0 # ignore the generated token
        # if has_info.shape[0] > 6:
        #     has_info[6] = 0 # test
        has_info = has_info.to("cuda")
        
        prev_outputs = model.model.embed_tokens(inputs)
        if self.copy_matt:
            avged = torch.zeros(n).to("cuda")
            for layer in model.model.layers:
                curr_outputs, curr_attns = layer(prev_outputs, output_attentions=True)
                attns_per_head = curr_attns[-1].mean(dim=0)
                vec = attns_per_head[-1] * has_info
                avged += vec / vec.sum()
                prev_outputs = curr_outputs
            return avged
        else:
            avged = torch.zeros((32, n, n)).to("cuda")
            for layer in model.model.layers:
                curr_outputs, curr_attns = layer(prev_outputs, output_attentions=True)
                # curr_attns is (1, 32, n, n)
                attns_per_head = curr_attns[-1].mean(dim=0)
                vec = attns_per_head * has_info
                avged += vec
            return avged.mean(dim=0).mean(dim=0)

    def _precompute_attns(self):
        n = len(self.tokens_encoded[0])
        self.precomputed_attentions = [
            self.filtered_aggregate_attention(
                self.tokens_encoded[:,:m]
            )
            for m in tqdm(range(1, n+1))
        ]
        return None

    def generate_attentions_for_selection(self, selected_start, n_selected=1):
        assert n_selected == 1, "Only one selection supported for now! (TODO)"
        # inputs = self.tokens_encoded[:, 0:(selected_start)]
        # print(self.tokens_decoded)
        # inputs = self.tokens_decoded[:, 0:selected_start]
        inputs = self.tokens_decoded[0:selected_start]
        print("generate_attns inputs.shape:", len(inputs))
        attn_m = heterogenous_stack(
            [
                torch.tensor([
                    1 if i == j else 0
                    for j in range(selected_start)
                ])
                for i in range(selected_start)
            ] +
            # list(map(aggregate_attention, outputs.attentions))
            # [self.filtered_aggregate_attention(inputs, outputs.attentions[0])]
            # [self.precomputed_attentions[selected_start+1]]
            [self.precomputed_attentions[selected_start-1].to("cpu")]
        )
        return attn_m

    def viz(self, selected_start, n_selected=1):
        """
        Visualize attention for a given selection
        """
        assert n_selected == 1, "Only one selection supported for now! (TODO)"
        attn_m = self.generate_attentions_for_selection(selected_start, n_selected)

        # Create a vector based on selected tokens
        # selected_vec = np.zeros(len(self.tokens_encoded[0]))
        print(attn_m.shape)
        selected_vec = np.zeros(attn_m.shape[0])
        selected_vec[selected_start:(selected_start + n_selected)] = 1

        # Calculate attention vector
        # attn_vec = np.dot(vec, attn_matrix.T)
        attn_vec = np.matmul(selected_vec, attn_m)
        min_val, max_val = min(attn_vec[1:]), max(attn_vec)
        attn_vec = (attn_vec - min_val) / (max_val - min_val)
        # Generate the HTML code for each token
        spans = []
        print(selected_vec.shape, attn_m.shape, attn_vec.shape, len(self.tokens_decoded))
        for i in range(0, len(self.tokens_decoded)):
            token = self.tokens_decoded[i]
            if token == "<s>":
                # think this is a special token. Possibly the start of the prompt?
                continue
            attn = 0
            selected = False
            if i < attn_vec.shape[0]:
                attn = attn_vec[i]
                #selected = False if i >= len(selected_vec) else selected_vec[i] == 1
            underline_style = "text-decoration: underline;" if i == selected_start else ""
            spans.append(f'<span style="background-color: rgba(255, 0, 0, {attn:.2f}); {underline_style}">{token}</span>')

        # Join the spans and display
        display(HTML(' '.join(spans)))

        # print(attn_matrix)
        print(attn_vec)

text_sub = "DNA computing is an emerging branch of unconventional computing which uses DNA, biochemistry, and molecular"

viz = AttentionVisualizer(model, tokenizer, drop_low_info_tokens=False, copy_matt=True)
viz.fit(text_sub)
viz.viz(13)



  0%|          | 0/20 [00:00<?, ?it/s][A[A

 10%|█         | 2/20 [00:00<00:01, 14.14it/s][A[A

 20%|██        | 4/20 [00:00<00:01, 13.07it/s][A[A

 30%|███       | 6/20 [00:00<00:01, 12.98it/s][A[A

 40%|████      | 8/20 [00:00<00:00, 12.99it/s][A[A

 50%|█████     | 10/20 [00:00<00:00, 12.93it/s][A[A

 60%|██████    | 12/20 [00:00<00:00, 12.85it/s][A[A

 70%|███████   | 14/20 [00:01<00:00, 12.76it/s][A[A

 80%|████████  | 16/20 [00:01<00:00, 12.69it/s][A[A

 90%|█████████ | 18/20 [00:01<00:00, 12.73it/s][A[A

100%|██████████| 20/20 [00:01<00:00, 12.87it/s][A[A

generate_attns inputs.shape: 13
torch.Size([14, 13])
(14,) torch.Size([14, 13]) torch.Size([13]) 20





tensor([-0.1362,  0.1380,  0.1292,  0.1275,  1.0000,  0.0733,  0.0595,  0.1178,
         0.0000,  0.1285,  0.3858,  0.6426,  0.9728], dtype=torch.float64)


In [92]:
viz = AttentionVisualizer(model, tokenizer, drop_low_info_tokens=False, copy_matt=True)
viz.fit(text_sub)
viz.viz(13)

  0%|          | 0/20 [00:00<?, ?it/s]

RuntimeError: output with shape [1] doesn't match the broadcast shape [2048]

In [23]:
from tqdm import tqdm
import numpy as np
from IPython.display import display, HTML

def heterogenous_stack(vecs):
    '''Pad vectors with zeros then stack'''
    max_length = max(v.shape[0] for v in vecs)
    return torch.stack([
        torch.concat((v, torch.zeros(max_length - v.shape[0])))
        for v in vecs
    ])

class AttentionVisualizer:

    def __init__(self, model, tokenizer, drop_low_info_tokens=True):
        self.low_info_tokens = [tokenizer.encode(token)[1]
            for token in [
                ",",
                "the",
                "of",
                # "'",
            ]
        ]
        # 31876 is the token for an apostrophe '
        # If I pass in "'" to the encode function, it will be encoded as
        # 910, which is different from what I get if there are characters preceding
        # it
        self.low_info_tokens.append(31876)
        self.model = model
        self.tokenizer = tokenizer
        self.drop_low_info_tokens = drop_low_info_tokens

    def fit(self, text):
        """
        Don't have to worry about fine-tuning the model on text yet
        """
        self.raw_text = text
        self.tokens_encoded = self.tokenizer.encode(text, return_tensors="pt")#.to("mps")
        self.tokens_decoded = [self.tokenizer.decode(token) for token in self.tokens_encoded[0]]
        self._precompute_attns()

    def filtered_aggregate_attention(self, inputs, attn, index):
        # TODO flag for low info tokens
        # print(inputs[0].shape)
        #print(inputs)
        # n_tokens = len(tokens[1:])
        # n_tokens = len(inputs[0][1:])
        n_tokens = len(inputs[0][1:])
        has_info = np.ones(n_tokens)
        if self.drop_low_info_tokens:
            # has_info = (inputs[0][1:] != torch.tensor(self.low_info_tokens).reshape(-1, 1)).all(0)
            has_info = (inputs[0][1:] != torch.tensor(self.low_info_tokens).reshape(-1, 1)).all(0)
        # print(n_tokens, has_info, has_info.shape, inputs.shape)
        avged = []
        for layer in attn:
            # print("layer.shape:", layer.shape)
            layer_attns = layer[index-1].squeeze(0)
            # print("layer_attns.shape:", layer_attns.shape)
            # layer_attns = layer.squeeze(0)
            attns_per_head = layer_attns.mean(dim=0)
            # print("attns_per_head.shape:", attns_per_head.shape)
            # print("has_info.shape:", has_info.shape)
            # assert attns_per_head[-1][1:].shape[0] == has_info.shape[0], (attns_per_head[-1][1:].shape, has_info.shape)
            # print((attns_per_head[-1][1:].to("cpu") * has_info).shape)
            # print(attns_per_head[-1][1:].shape) # 19, 20
            # foo = attns_per_head[-1][1:].to("cpu") * has_info
            vec = torch.concat((
                # We zero the first entry because it's what's called
                # null attention (https://aclanthology.org/W19-4808.pdf)
                torch.tensor([0.]),
                # usually there's only one item in attns_per_head but
                # on the first generation, there's a row for each token
                # in the prompt as well, so take [-1]
                attns_per_head[-1][1:] * has_info,
                # generated token gets 0 weight
                torch.tensor([0.]),
            ))
            avged.append(vec / vec.sum())
        return torch.stack(avged).mean(dim=0)

    def _precompute_attns(self):
        n = len(self.tokens_encoded[0])
        batched_ids = []
        attention_mask = []
        batched_ids = [
            list(self.tokens_encoded[0,:m]) + [tokenizer.pad_token_id] * (n - m)
            for m in range(1, n+1)
        ]
        attention_mask = [
            [1] * m + [0] * (n - m)
            for m in range(1, n+1)
        ]
        outputs = model(
            torch.tensor(batched_ids),
            attention_mask=torch.tensor(attention_mask),
            output_attentions=True,
            return_dict=True,
        )
        print("output_attentions.shape", len(outputs.attentions), outputs.attentions[0].shape)
        # TODO how to use filtered_aggregate_attention???
        self.precomputed_attentions = [
            self.filtered_aggregate_attention(
                self.tokens_encoded,
                outputs.attentions,
                m,
                # outputs.attentions[:,:m,:,:m,:m]
            )
            for m in range(2, n+1)
        ]
        return None

    def generate_attentions_for_selection(self, selected_start, n_selected=1):
        assert n_selected == 1, "Only one selection supported for now! (TODO)"
        # inputs = self.tokens_encoded[:, 0:(selected_start)]
        # print(self.tokens_decoded)
        # inputs = self.tokens_decoded[:, 0:selected_start]
        inputs = self.tokens_decoded[0:selected_start]
        print("generate_attns inputs.shape:", len(inputs))
        attn_m = heterogenous_stack(
            [
                torch.tensor([
                    1 if i == j else 0
                    for j, token in enumerate(inputs)
                ])
                for i, token in enumerate(inputs)
            ] +
            # list(map(aggregate_attention, outputs.attentions))
            # [self.filtered_aggregate_attention(inputs, outputs.attentions[0])]
            # [self.precomputed_attentions[selected_start+1]]
            [self.precomputed_attentions[selected_start-1]]
        )
        return attn_m

    def viz(self, selected_start, n_selected=1):
        """
        Visualize attention for a given selection
        """
        assert n_selected == 1, "Only one selection supported for now! (TODO)"
        attn_m = self.generate_attentions_for_selection(selected_start, n_selected)

        # Create a vector based on selected tokens
        # selected_vec = np.zeros(len(self.tokens_encoded[0]))
        print(attn_m.shape)
        selected_vec = np.zeros(attn_m.shape[0])
        selected_vec[selected_start:(selected_start + n_selected)] = 1

        # Calculate attention vector
        # attn_vec = np.dot(vec, attn_matrix.T)
        attn_vec = np.matmul(selected_vec, attn_m)
        min_val, max_val = min(attn_vec), max(attn_vec)
        attn_vec = (attn_vec - min_val) / (max_val - min_val)
        # Generate the HTML code for each token
        spans = []

        print(selected_vec.shape, attn_m.shape, attn_vec.shape, len(self.tokens_decoded))
        for i in range(1, len(self.tokens_decoded)):
            token = self.tokens_decoded[i]
            # if token == "<s>":
            #     # think this is a special token. Possibly the start of the prompt?
            #     continue
            attn = 0
            selected = False
            if i < attn_vec.shape[0]:
                attn = attn_vec[i]
                selected = False if i >= len(selected_vec) else selected_vec[i] == 1
            underline_style = "text-decoration: underline;" if selected else ""
            spans.append(f'<span style="background-color: rgba(255, 0, 0, {attn:.2f}); {underline_style}">{token}</span>')

        # Join the spans and display
        display(HTML(' '.join(spans)))

        # print(attn_matrix)
        print(attn_vec)

text_sub = "DNA computing is an emerging branch of unconventional computing which uses DNA, biochemistry, and molecular"

viz = AttentionVisualizer(model, tokenizer, drop_low_info_tokens=True)
viz.fit(text_sub)
viz.viz(13)

output_attentions.shape 32 torch.Size([20, 32, 20, 20])
generate_attns inputs.shape: 13
torch.Size([14, 21])
(14,) torch.Size([14, 21]) torch.Size([21]) 20


tensor([0.0000, 0.9586, 1.0000, 0.5960, 0.4947, 0.3948, 0.3723, 0.0000, 0.3422,
        0.3442, 0.4874, 0.4911, 0.5482, 0.8760, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], dtype=torch.float64)
