In [45]:
from transformers import AutoModel, AutoTokenizer, BertForSequenceClassification, BertTokenizer
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from datasets import load_dataset

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [46]:
model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-SST-2", num_labels=2)
model.to("cuda")

for name, module in model.named_modules():
    print(name)



bert
bert.embeddings
bert.embeddings.word_embeddings
bert.embeddings.position_embeddings
bert.embeddings.token_type_embeddings
bert.embeddings.LayerNorm
bert.embeddings.dropout
bert.encoder
bert.encoder.layer
bert.encoder.layer.0
bert.encoder.layer.0.attention
bert.encoder.layer.0.attention.self
bert.encoder.layer.0.attention.self.query
bert.encoder.layer.0.attention.self.key
bert.encoder.layer.0.attention.self.value
bert.encoder.layer.0.attention.self.dropout
bert.encoder.layer.0.attention.output
bert.encoder.layer.0.attention.output.dense
bert.encoder.layer.0.attention.output.LayerNorm
bert.encoder.layer.0.attention.output.dropout
bert.encoder.layer.0.intermediate
bert.encoder.layer.0.intermediate.dense
bert.encoder.layer.0.intermediate.intermediate_act_fn
bert.encoder.layer.0.output
bert.encoder.layer.0.output.dense
bert.encoder.layer.0.output.LayerNorm
bert.encoder.layer.0.output.dropout
bert.encoder.layer.1
bert.encoder.layer.1.attention
bert.encoder.layer.1.attention.self
bert.e

In [40]:
BATCH_SIZE = 32
# Load the GLUE SST-2 dataset
# Load SST-2 validation data
dataset = load_dataset("glue", "sst2", split="validation")
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("textattack/bert-base-uncased-SST-2")

def tokenize(batch):
    return tokenizer(batch['sentence'], padding='max_length', truncation=True, max_length=128)

dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)

In [48]:
def collect_activations(model, dataloader, layer):
    activations = []

    def hook_fn(module, input, output):
        activations.append(output.detach())

    # Register a forward hook on the layer to capture activations
    handle = layer.register_forward_hook(hook_fn)

    # Pass the data through the model to collect activations
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_ids'].to(next(model.parameters()).device)
            model(inputs)

    handle.remove()  # Clean up the hook
    X = torch.cat(activations, dim=0)
    if X.ndim > 2:
        X = X.permute(2, 0, 1, *range(3, X.ndim))
        X = X.reshape(X.shape[0], -1)
    else:
        X = X.T
    return X


def low_rank_approximation(X, W, rank):
    """
    Perform data-aware low-rank approximation of W using the DRONE algorithm.
    Args:
        X (torch.Tensor): Matrix of input activations (shape: [d1, n]).
        W (torch.Tensor): Original weight matrix (shape: [d2, d1]).
        rank (int): Target rank for low-rank approximation.
    Returns:
        U_star (torch.Tensor): Left matrix of the low-rank approximation.
        V_star (torch.Tensor): Right matrix of the low-rank approximation.
    """

    print("W_shape", W.shape)
    print("X_shape", X.shape)

    if W.ndim < 2:
        print("Invalid weight matrix with ndim < 2, skipping this layer.")
        return W, False
    elif W.shape[1] < rank:
        print("Rank of weight matrix already lower than k value, skipping this layer.")
        return W, False
    elif W.shape[1] != X.shape[0]:
        print("Invalid weight matrix dimensions, skipping this layer.")
        return W, False

    # Step 1: SVD of W -> W = U_W S_W V_W^T
    U_W, S_W, V_W_T = torch.linalg.svd(W, full_matrices=False)
    U_W_r = U_W[:, :rank]
    S_W_r = S_W[:rank]
    V_W = V_W_T.T
    V_W_r = V_W[:, :rank]

    # Step 2: SVD of X -> X = U_X S_X V_X^T

    t = X.shape[0]
    U_X, S_X, V_X_T = torch.linalg.svd(X, full_matrices=False)
    U_X_t = U_X[:, :t]
    S_X_t = S_X[:t]
    V_X = V_X_T.T
    V_X_t = V_X[:, :t]

    # Step 3: Compute Z = S_W_r V_W_r^T U_X_t S_X_t

    z1 = torch.diag(S_W_r) @ V_W_r.T
    z2 = z1 @ U_X_t
    Z = z2 @ torch.diag(S_X_t)
    #Z = torch.diag(S_W_r) @ V_W_r @ U_X_t.T @ torch.diag(S_X_t)

    # Step 4: Truncated SVD of Z to get Z_k = U_Z,k S_Z,k V_Z,k^T
    U_Z, S_Z, V_Z_T = torch.linalg.svd(Z, full_matrices=False)
    U_Z_k = U_Z[:, :rank]
    S_Z_k = S_Z[:rank]
    V_Z_k = V_Z_T[:rank, :]

    # Step 5: Construct U_star and V_star
    U_star = W @ V_W_r @ torch.diag(1 / S_W_r) @ U_Z_k @ torch.diag(S_Z_k)
    V_star = V_Z_k @ torch.diag(1 / S_X_t) @ U_X_t.T

    # Approximate W with the low-rank matrices
    W_approx = U_star @ V_star
    print("W_approx", W_approx.shape)
    W_approx = W_approx.reshape(W.shape)

    return W_approx, True

def low_rank_approximation_svd(X,W, rank):
    """
    Compress a given layer using only SVD with a specified rank.
    Args:
        layer (torch.nn.Module): The layer to compress.
        rank (int): Target rank for the low-rank approximation.
    Returns:
        torch.Tensor: Compressed weight matrix.
    """
    if W.ndim < 2:
        print("Invalid weight matrix with ndim < 2, skipping this layer.")
        return W, False
    elif W.shape[1] < rank:
        print("Rank of weight matrix already lower than target rank, skipping this layer.")
        return W, False

    # Perform SVD on the weight matrix W -> W = U S V^T
    U, S, V_T = torch.linalg.svd(W, full_matrices=False)
    U_r = U[:, :rank]
    S_r = S[:rank]
    V_r = V_T[:rank, :]

    # Construct low-rank approximation: W_approx = U_r * S_r * V_r^T
    W_approx = U_r @ torch.diag(S_r) @ V_r
    return W_approx, True

def compress_layer(model, dataloader, layer_name, rank):
    """
    Compresses a specified layer in the model using data-aware low-rank approximation.
    """
    # Access layer by name
    layer = dict(model.named_modules())[layer_name]

    # Check if layer has weights
    if hasattr(layer, 'weight'):
        W = layer.weight.data

        # Extract activations for input distribution
        X = collect_activations(model, dataloader, layer)

        # Calculate the low-rank approximation of the layer's weight matrix
        W_approx, successful_compressed = low_rank_approximation_svd(X, W, rank)

        # Update layer's weight with the compressed approximation
        if successful_compressed == True:
          layer.weight.data = W_approx

    return model, successful_compressed
# Example usage:
# compressed_model = compress_layer(model, 'fc1', dataloader, rank=10)

In [49]:
def overall_low_rank_approximation(model, dataloader, k_values, allowed_loss_ratio, layer_names):
    """
    Apply Algorithm 2 to compress a model layer-by-layer using Algorithm 1.
    """
    original_loss = evaluate_model_loss(model, dataloader)
    total_layers = len(layer_names)

    for i, layer_name in enumerate(layer_names):
        layer = dict(model.named_modules())[layer_name]
        original_weights = {}  # Store original weights for potential restoration
        print("------------------")
        print("Layer Name: ", layer_name)
        # Iterate over submodules and store original weights
        for name, submodule in layer.named_modules():
            if hasattr(submodule, 'weight'):
                original_weights[name] = submodule.weight.data.clone()  # Store a copy

        current_k = k_values[i]  # Predefined rank for this layer

        model, result = compress_layer(model, dataloader, layer_name, current_k)
        if result == False:
            continue
        # Evaluate new model loss after compression
        new_loss = evaluate_model_loss(model, dataloader)
        print("original_loss", original_loss)
        print("new_loss", new_loss)
        if new_loss / original_loss < 1 + allowed_loss_ratio:
            print(f"Layer {layer_name} compressed with rank {current_k} under allowed loss ratio.")
        else:
            print(f"Layer {layer_name} compression with rank {current_k} exceeded allowed loss. Skipping.")
            # Restore original weights if compression exceeded allowed loss ratio
            for name, submodule in layer.named_modules():
                if hasattr(submodule, 'weight') and name in original_weights:
                    submodule.weight.data = original_weights[name]

    return model

def evaluate_model_loss(model, dataloader):
    """
    Computes the average loss of the model on the SST-2 dataset.
    """
    model.eval()  # Set model to evaluation mode
    loss_fn = torch.nn.CrossEntropyLoss()  # Define the loss function
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():  # No need to compute gradients
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = batch['label'].to(model.device)

            # Get model outputs
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            # Get model outputs
            num_batches += 1

    # Calculate average loss
    avg_loss = total_loss / num_batches
    return avg_loss

def evaluate_model_accuracy(model, dataloader):
    """
    Computes the accuracy of the model on the SST-2 dataset.
    """
    model.eval()  # Set model to evaluation mode
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():  # No need to compute gradients
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = batch['label'].to(model.device)

            # Get model outputs (logits)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Predicted class (0 or 1)
            predictions = torch.argmax(logits, dim=-1)

            # Update counts for accuracy calculation
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

    # Calculate accuracy
    accuracy = correct_predictions / total_predictions
    return accuracy

In [50]:
acc = evaluate_model_accuracy(model, dataloader)
print(f"Average accuracy of the model: {acc}")

Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

Average accuracy of the model: 0.9243119266055045


In [51]:

layer_names = [name for name, module in model.named_modules() if hasattr(module, 'weight')]
k_values = [200] * 1000 # Chosen rank k for each layer
allowed_loss_ratio = 0.5  # 50% allowed loss increase

# Run the overall low-rank approximation
compressed_model = overall_low_rank_approximation(model, dataloader, k_values, allowed_loss_ratio, layer_names)

acc = evaluate_model_accuracy(compressed_model, dataloader)
print(f"Average accuracy of the compressed model: {acc}")

Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

------------------
Layer Name:  bert.embeddings.word_embeddings


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.7814795023628643
Layer bert.embeddings.word_embeddings compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.embeddings.position_embeddings


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.2749784386583737
Layer bert.embeddings.position_embeddings compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.embeddings.token_type_embeddings


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.2749783900965537
Layer bert.embeddings.token_type_embeddings compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.embeddings.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.0.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.2726041973967637
Layer bert.encoder.layer.0.attention.self.query compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.0.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.2729104801214167
Layer bert.encoder.layer.0.attention.self.key compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.0.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.2709803729584174
Layer bert.encoder.layer.0.attention.self.value compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.0.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.26953241335494177
Layer bert.encoder.layer.0.attention.output.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.0.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.0.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.26370313464264783
Layer bert.encoder.layer.0.intermediate.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.0.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.26254055788740516
Layer bert.encoder.layer.0.output.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.0.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.1.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.264237355440855
Layer bert.encoder.layer.1.attention.self.query compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.1.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.2639762883606766
Layer bert.encoder.layer.1.attention.self.key compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.1.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.26519322894247516
Layer bert.encoder.layer.1.attention.self.value compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.1.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.26524933114913957
Layer bert.encoder.layer.1.attention.output.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.1.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.1.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.2809971946158579
Layer bert.encoder.layer.1.intermediate.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.1.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.27999476955405306
Layer bert.encoder.layer.1.output.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.1.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.2.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.28623584125723156
Layer bert.encoder.layer.2.attention.self.query compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.2.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.2882212611979672
Layer bert.encoder.layer.2.attention.self.key compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.2.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.29722578584083487
Layer bert.encoder.layer.2.attention.self.value compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.2.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.29518560266920496
Layer bert.encoder.layer.2.attention.output.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.2.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.2.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.3037859776190349
Layer bert.encoder.layer.2.intermediate.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.2.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.308797079537596
Layer bert.encoder.layer.2.output.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.2.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.3.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.31098499095865656
Layer bert.encoder.layer.3.attention.self.query compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.3.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.31172905010836466
Layer bert.encoder.layer.3.attention.self.key compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.3.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.3211311490408012
Layer bert.encoder.layer.3.attention.self.value compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.3.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.32560085505247116
Layer bert.encoder.layer.3.attention.output.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.3.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.3.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.37773120775818825
Layer bert.encoder.layer.3.intermediate.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.3.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4165953999119146
Layer bert.encoder.layer.3.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.3.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.4.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.37487488985061646
Layer bert.encoder.layer.4.attention.self.query compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.4.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.3775365762412548
Layer bert.encoder.layer.4.attention.self.key compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.4.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.415558647364378
Layer bert.encoder.layer.4.attention.self.value compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.4.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.37211353172148975
Layer bert.encoder.layer.4.attention.output.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.4.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.4.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.44846236758998465
Layer bert.encoder.layer.4.intermediate.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.4.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4011628127523831
Layer bert.encoder.layer.4.output.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.4.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.5.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4057040124067238
Layer bert.encoder.layer.5.attention.self.query compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.5.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4113132895103523
Layer bert.encoder.layer.5.attention.self.key compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.5.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4091731049120426
Layer bert.encoder.layer.5.attention.self.value compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.5.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41681705149156706
Layer bert.encoder.layer.5.attention.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.5.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.5.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4121189144040857
Layer bert.encoder.layer.5.intermediate.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.5.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4155726709536144
Layer bert.encoder.layer.5.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.5.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.6.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41211013389485224
Layer bert.encoder.layer.6.attention.self.query compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.6.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41157487726637293
Layer bert.encoder.layer.6.attention.self.key compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.6.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4257144518196583
Layer bert.encoder.layer.6.attention.self.value compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.6.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4199960455298424
Layer bert.encoder.layer.6.attention.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.6.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.6.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4421086992536272
Layer bert.encoder.layer.6.intermediate.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.6.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.43319650047591757
Layer bert.encoder.layer.6.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.6.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.7.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4126806487994535
Layer bert.encoder.layer.7.attention.self.query compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.7.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4140085939850126
Layer bert.encoder.layer.7.attention.self.key compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.7.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4187536319451673
Layer bert.encoder.layer.7.attention.self.value compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.7.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4148987739213875
Layer bert.encoder.layer.7.attention.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.7.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.7.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4397796296647617
Layer bert.encoder.layer.7.intermediate.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.7.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4120995263968195
Layer bert.encoder.layer.7.output.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.7.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.8.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41475886745112284
Layer bert.encoder.layer.8.attention.self.query compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.8.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41314889543822836
Layer bert.encoder.layer.8.attention.self.key compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.8.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4199012645653316
Layer bert.encoder.layer.8.attention.self.value compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.8.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.42057754365461214
Layer bert.encoder.layer.8.attention.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.8.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.8.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.42430873747382847
Layer bert.encoder.layer.8.intermediate.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.8.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4220518650753157
Layer bert.encoder.layer.8.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.8.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.9.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4129624121955463
Layer bert.encoder.layer.9.attention.self.query compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.9.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.415976528610502
Layer bert.encoder.layer.9.attention.self.key compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.9.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41518708212035044
Layer bert.encoder.layer.9.attention.self.value compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.9.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4145363707627569
Layer bert.encoder.layer.9.attention.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.9.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.9.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.44969739126307623
Layer bert.encoder.layer.9.intermediate.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.9.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.42296232283115387
Layer bert.encoder.layer.9.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.9.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.10.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41345842874475885
Layer bert.encoder.layer.10.attention.self.query compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.10.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41437909645693644
Layer bert.encoder.layer.10.attention.self.key compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.10.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4181487315467426
Layer bert.encoder.layer.10.attention.self.value compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.10.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4158976429275104
Layer bert.encoder.layer.10.attention.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.10.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.10.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41239914457712856
Layer bert.encoder.layer.10.intermediate.dense compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.10.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41851973852940966
Layer bert.encoder.layer.10.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.10.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.11.attention.self.query


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41248541484986034
Layer bert.encoder.layer.11.attention.self.query compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.11.attention.self.key


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4130551942757198
Layer bert.encoder.layer.11.attention.self.key compressed with rank 200 under allowed loss ratio.
------------------
Layer Name:  bert.encoder.layer.11.attention.self.value


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.413931849279574
Layer bert.encoder.layer.11.attention.self.value compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.11.attention.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4156957417726517
Layer bert.encoder.layer.11.attention.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.11.attention.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.encoder.layer.11.intermediate.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4134462816374643
Layer bert.encoder.layer.11.intermediate.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.11.output.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.4142809786966869
Layer bert.encoder.layer.11.output.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  bert.encoder.layer.11.output.LayerNorm
Invalid weight matrix with ndim < 2, skipping this layer.
------------------
Layer Name:  bert.pooler.dense


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41355512397629873
Layer bert.pooler.dense compression with rank 200 exceeded allowed loss. Skipping.
------------------
Layer Name:  classifier


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

original_loss 0.27555497576083454
new_loss 0.41305520279066904
Layer classifier compressed with rank 200 under allowed loss ratio.


Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

Average accuracy of the compressed model: 0.8073394495412844
