In [1]:
import tensorly as tl
import tensorly.decomposition as tldec 
import numpy as np

In [2]:
# Toy input for the CP-decomposition experiments: a 10x10x10 array of uniform
# random values. A fixed seed makes the tensor identical on every
# Restart & Run All, so results computed from it can be compared across runs.
rng = np.random.default_rng(0)
tensor = rng.random((10, 10, 10))

In [3]:
# Display the generated tensor to eyeball its values (prints the full array).
tensor

array([[[0.82207329, 0.9420222 , 0.27899614, 0.79602922, 0.98895746,
         0.562426  , 0.50371534, 0.84261989, 0.85836972, 0.28953628],
        [0.74579835, 0.32568329, 0.12986719, 0.98793715, 0.49842745,
         0.08300429, 0.10671885, 0.02543785, 0.41774241, 0.86382963],
        [0.74461656, 0.28881629, 0.25449231, 0.8542046 , 0.1624885 ,
         0.66368394, 0.57455713, 0.99727968, 0.44272303, 0.17041027],
        [0.98139399, 0.90907539, 0.93785202, 0.37127623, 0.08534596,
         0.71371593, 0.85760864, 0.63733075, 0.79489343, 0.65324378],
        [0.4092391 , 0.07387796, 0.47692241, 0.78028155, 0.16623292,
         0.44481688, 0.48066373, 0.19053311, 0.73895736, 0.16441765],
        [0.1330739 , 0.3957939 , 0.68110818, 0.69564948, 0.91199384,
         0.75916287, 0.17010984, 0.19735753, 0.09769318, 0.35308816],
        [0.87296452, 0.16286193, 0.36797862, 0.66689567, 0.63677524,
         0.62862899, 0.02004647, 0.09652285, 0.38015941, 0.3117757 ],
        [0.55271423, 0.4480

In [4]:
# CP (PARAFAC) decomposition of the random tensor using rank 5.
# Per the repr shown when `factors` is displayed, the result is a
# (weights, factors) CPTensor rather than a bare list of factor matrices.
factors = tldec.parafac(tensor, rank=5)

In [5]:
# Show the summary repr of the decomposition result.
factors

(weights, factors) : rank-5 CPTensor of shape (10, 10, 10)

In [6]:
# Shape of the first component of the result. Given the "(weights, factors)"
# repr, this is presumably the weights vector, with one entry per rank-1 term.
factors[0].shape

(5,)

In [7]:
# Rebuild the full tensor from its CP decomposition.
# `cp_to_tensor` is the current TensorLy name; `kruskal_to_tensor` is the
# deprecated pre-rename alias for the same operation.
reconstructed_tensor = tl.cp_to_tensor(factors)

In [8]:
# Reconstruction error: norm of the element-wise residual between the original
# tensor and its CP reconstruction (referred to as the Frobenius norm in the
# cells that follow; relies on tl.norm's default order -- TODO confirm).
error = tl.norm(tensor - reconstructed_tensor)

In [9]:
# Show the reconstruction error computed in the previous cell.
error

8.07565625683765

In [11]:
# Rank-3 experiment: decompose, reconstruct, and report the Frobenius-norm
# reconstruction error. Uses `cp_to_tensor`, the current name of the
# deprecated `kruskal_to_tensor`.

factors = tldec.parafac(tensor, rank=3)
reconstructed_tensor = tl.cp_to_tensor(factors)
error = tl.norm(tensor - reconstructed_tensor)
error

8.624532328611688

In [12]:
# Rank-10 experiment: decompose, reconstruct, and report the Frobenius-norm
# reconstruction error. Uses `cp_to_tensor`, the current name of the
# deprecated `kruskal_to_tensor`.

factors = tldec.parafac(tensor, rank=10)
reconstructed_tensor = tl.cp_to_tensor(factors)
error = tl.norm(tensor - reconstructed_tensor)
error

6.611501367850849

In [13]:
# Rank-50 experiment (rank well above each mode size of 10): decompose,
# reconstruct, and report the Frobenius-norm reconstruction error.
# Uses `cp_to_tensor`, the current name of the deprecated `kruskal_to_tensor`.

factors = tldec.parafac(tensor, rank=50)
reconstructed_tensor = tl.cp_to_tensor(factors)
error = tl.norm(tensor - reconstructed_tensor)
error

0.015378341940359199

# Some Tests on the RoBERTa Model

In [14]:
# Load the pretrained RoBERTa tokenizer and encoder.
# Both are fetched by the same checkpoint name, 'roberta-base'.
from transformers import RobertaTokenizer, RobertaModel
import torch

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# `model` is the object whose attention weights are read and later overwritten.
model = RobertaModel.from_pretrained('roberta-base')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# List every named parameter of the model alongside its shape, one per line.
for name, param in model.named_parameters():
    print(f"{name} {param.shape}")


embeddings.word_embeddings.weight torch.Size([50265, 768])
embeddings.position_embeddings.weight torch.Size([514, 768])
embeddings.token_type_embeddings.weight torch.Size([1, 768])
embeddings.LayerNorm.weight torch.Size([768])
embeddings.LayerNorm.bias torch.Size([768])
encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
encoder.layer.0.attention.self.query.bias torch.Size([768])
encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
encoder.layer.0.attention.self.key.bias torch.Size([768])
encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
encoder.layer.0.attention.self.value.bias torch.Size([768])
encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
encoder.layer.0.attention.output.dense.bias torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
encoder.layer.0.intermediate.dense.weight torch.Size([3072, 768])
encoder.layer.0.inter

In [16]:
# Indices of the encoder layers whose attention weights will be decomposed.
# Alternative kept for reference (layer indices 0 through 11):
#selected_layers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
selected_layers = [8, 9]

In [17]:
# Collect, per selected layer, the four attention projection matrices in the
# fixed order query, key, value, output.dense. Only these four are taken for
# now because they all have the same size and can therefore be stacked.
weights_to_decompose = []
for layer in selected_layers:
    attention = model.encoder.layer[layer].attention
    projections = (
        attention.self.query,
        attention.self.key,
        attention.self.value,
        attention.output.dense,
    )
    for projection in projections:
        weights_to_decompose.append(projection.weight.detach().numpy())

In [18]:
# Stack the collected weight matrices along a new leading axis into one 3-D array.
# NOTE(review): this rebinds `tensor`, which earlier held the random 10x10x10
# example; cells above must not be re-run after this point without re-creating it.
tensor = np.stack(weights_to_decompose)

In [19]:
# Sanity check: leading axis = 4 matrices for each selected layer.
tensor.shape

(8, 768, 768)

In [20]:
# CP rank used for the stacked attention weights; change this single value to
# try other ranks. The call below now actually uses `rank` (it was previously
# hard-coded to 5, so editing `rank` had no effect).
rank = 5
factors = tldec.parafac(tensor, rank=rank)

In [21]:
# Rebuild the low-rank approximation of the stacked weights from the CP factors.
# `cp_to_tensor` is the current TensorLy name of the deprecated `kruskal_to_tensor`.
reconstructed_tensor_np = tl.cp_to_tensor(factors)
# Wrap the NumPy result as a torch tensor so slices can be written back into the model.
reconstructed_tensor = torch.from_numpy(reconstructed_tensor_np)

In [23]:
# Write the low-rank reconstructions back into the model.
# The stacking order per layer is query, key, value, output.dense, so slice
# `i * matrices_per_layer + offset` belongs to the offset-th projection of the
# i-th selected layer.
matrices_per_layer = 4
for i, layer in enumerate(selected_layers):
    index = i * matrices_per_layer
    attention = model.encoder.layer[layer].attention
    projections = (
        attention.self.query,
        attention.self.key,
        attention.self.value,
        attention.output.dense,
    )
    for offset, projection in enumerate(projections):
        original = projection.weight
        approximation = reconstructed_tensor[index + offset].detach().clone()
        # Keep the replaced parameter's dtype, device and trainability so the
        # model stays internally consistent (the decomposition runs in NumPy and
        # is not guaranteed to return the model's dtype).
        projection.weight = torch.nn.Parameter(
            approximation.to(dtype=original.dtype, device=original.device),
            requires_grad=original.requires_grad,
        )
