# DistilBERT Testing Notebook

In [1]:
%load_ext autoreload
%autoreload 2
# Enables autoreload in mode 2, which reloads imported modules before each cell executes,
# so edits to the project's .py files are picked up without restarting the kernel.
# Learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [6]:
import os
import sys

# Hugging Face settings are exported BEFORE any other import on purpose:
# `datasets` resolves HF_DATASETS_CACHE once, at import time, so assigning it after
# `datasets` (or any project module that imports it) has been loaded leaves the
# default cache directory in use.
os.environ["HF_DATASETS_CACHE"] = "/dbfs/hf_datasets"
# Disable tokenizers' internal parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Make the parent directory importable (the project modules imported below are
# presumably located there). The membership check keeps the cell idempotent, so
# re-running it does not append the same entry again.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from constants import (
    TARGET_SPARSITY_LOW, TARGET_SPARSITY_MID, TARGET_SPARSITY_HIGH,
    BATCH_SIZE_CNN, BATCH_SIZE_VIT, BATCH_SIZE_LLM,
    EPOCHS_SMALL_MODEL, EPOCHS_LARGE_MODEL, EPOCHS_VIT
)
from utils import get_device, get_num_workers, load_weights, print_statistics
from unstructured_pruning import check_model_sparsity, check_sparsity_distribution
from trainer import TrainingArguments, Trainer
from bacp import BaCPTrainingArguments, BaCPTrainer

from datasets.utils.logging import disable_progress_bar

# Silence the `datasets` progress bars so cell output stays readable.
disable_progress_bar()

In [7]:
# Resolve the compute device and the DataLoader worker count once, then report both.
DEVICE = get_device()
NUM_WORKERS = get_num_workers()
print(f"Using device: {DEVICE}")
print(f"Using {NUM_WORKERS} workers")

Using device: cuda
Using 288 workers


# DistilBERT

In [8]:
# Model identifier; passed as `model_name` and interpolated into the checkpoint paths below.
MODEL_NAME = "distilbert-base-uncased"
# Task/dataset key; passed as `model_task` and interpolated into the checkpoint paths below.
MODEL_TASK = "wikitext2"
# Notebook-wide switch: cells guarded by `if TRAIN:` call trainer.train() before evaluating.
TRAIN = True

## Baseline Accuracies

In [9]:
# Baseline run: dense fine-tuning configuration used as the reference for the pruning experiments.
#
# Retraining is opt-in for this cell. It happens only when BOTH the notebook-wide TRAIN
# switch and this per-cell flag are True. This replaces a hard-coded `if False:` that
# silently ignored TRAIN and left the train() call as dead code.
RETRAIN_BASELINE = False

training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-5),
    scheduler_type='linear_with_warmup',
    # NOTE(review): hard-coded epoch count; confirm whether one of the imported
    # EPOCHS_* constants was intended here.
    epochs=50,
    learning_type="baseline",
    db=False,
)
trainer = Trainer(training_args=training_args)
if TRAIN and RETRAIN_BASELINE:
    trainer.train()

# Evaluation always runs. With training skipped, it relies on a previously saved
# baseline checkpoint being available (the recorded output shows the weights being loaded).
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Optimizer type w/ learning rate: (adamw, 5e-05)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] Linear scheduler initialized with warmup steps: 355 and total steps: 3550
[TRAINER] Pruning not initialized
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.0


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     64.37%
  Perplexity:   5.601

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.0000 (0.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        baseline
  Batch Size:           64
  Learning Rate:        5e-05
  Optimizer:            adamw
  Epochs:               50

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





## Pruning Accuracies

### Magnitude Prune

In [29]:
# Magnitude pruning at TARGET_SPARSITY_LOW, starting from the fine-tuned baseline checkpoint.
#
# Retraining is opt-in for this cell. It happens only when BOTH the notebook-wide TRAIN
# switch and this per-cell flag are True. This replaces a hard-coded `if False:` that
# silently ignored TRAIN and left the train() call as dead code.
RETRAIN_MAGNITUDE_LOW = False

# Path of the baseline checkpoint handed to the trainer as `finetuned_weights`.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 2e-5),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if TRAIN and RETRAIN_MAGNITUDE_LOW:
    trainer.train()

# Evaluation always runs. With training skipped, it relies on a previously saved
# pruned checkpoint being available (the recorded output shows the weights being loaded).
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 2e-05)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Target sparsity: 0.95
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.95_pruning.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.95_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.95


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     10.14%
  Perplexity:   5466.271

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9500 (95.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        2e-05
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         magnitude_pruning
  Target Sparsity:      0.95
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





{'input_ids': tensor([[  101,   102,   101,  ...,   103, 15046,  1999],
        [ 2526,  2004,   103,  ...,  1999,   103,  8595],
        [ 1010,  2040, 22073,  ...,  1996,  3166,  2043],
        ...,
        [ 2244,   103,  1012,  ...,  1011,  1030,  9152],
        [ 1030,  1011,  1030,  ...,  2743, 17318,  2247],
        [ 2049,  2670,  2918,  ...,   103,  6445,  2094]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  2198,  -100,  -100],
        [ -100,  -100,  1000,  ...,  -100, 20325,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        ...,
        [ -100,  1019,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  1996,  -100,  -100]])}

In [31]:
# Sanity check: show only the first batch produced by the test loader.
data = next(iter(trainer.testloader), None)
if data is not None:
    print(data)

{'input_ids': tensor([[  101,   102,   101,  ...,   103, 15046,  1999],
        [ 2526,  2004,   103,  ...,  1999,   103,  8595],
        [ 1010,  2040, 22073,  ...,  1996,  3166,  2043],
        ...,
        [ 2244,   103,  1012,  ...,  1011,  1030,  9152],
        [ 1030,  1011,  1030,  ...,  2743, 17318,  2247],
        [ 2049,  2670,  2918,  ...,   103,  6445,  2094]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  2198,  -100,  -100],
        [ -100,  -100,  1000,  ...,  -100, 20325,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        ...,
        [ -100,  1019,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  1996,  -100,  -100]])}


In [32]:
# Sanity check: show only the first batch produced by the validation loader.
data = next(iter(trainer.valloader), None)
if data is not None:
    print(data)

{'input_ids': tensor([[  101,   102,   101,  ...,  2137,   103,  1012],
        [ 1996,   103,  2427,  ...,  5046, 26158,  2015],
        [  103,   103,   103,  ...,  1010,  1998,  2087],
        ...,
        [ 2863,   103, 23593,  ...,   103,  1005,  1055],
        [ 2190,  2434,   103,  ...,  2806,  6186,  2299],
        [  103,  6232, 10057,  ...,  1010,   103,   103]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  -100,  2271,  -100],
        [ -100,  2048,  -100,  ...,  -100,  -100,  -100],
        [ 5939, 22747, 24876,  ...,  -100,  -100,  2087],
        ...,
        [ -100,  1010,  -100,  ...,  7579,  -100,  -100],
        [ -100,  -100,  2573,  ...,  -100,  -100,  -100],
        [ 2008,  -100,  -100,  ...,  -100,  1996,  2299]])}


In [34]:
# Magnitude pruning at TARGET_SPARSITY_MID, starting from the fine-tuned baseline checkpoint.
#
# Retraining is opt-in for this cell. It happens only when BOTH the notebook-wide TRAIN
# switch and this per-cell flag are True. This replaces a hard-coded `if False:` that
# silently ignored TRAIN and left the train() call as dead code.
RETRAIN_MAGNITUDE_MID = False

# Path of the baseline checkpoint handed to the trainer as `finetuned_weights`.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 2e-5),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if TRAIN and RETRAIN_MAGNITUDE_MID:
    trainer.train()

# Evaluation always runs. With training skipped, it relies on a previously saved
# pruned checkpoint being available (the recorded output shows the weights being loaded).
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 2e-05)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Target sparsity: 0.97
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.97_pruning.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.97_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.97


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     8.65%
  Perplexity:   7899.856

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9700 (97.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        2e-05
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         magnitude_pruning
  Target Sparsity:      0.97
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [24]:
# Magnitude pruning at TARGET_SPARSITY_HIGH; the baseline checkpoint path is handed
# to the trainer through `finetuned_weights`.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Collect the run configuration first, then expand it into TrainingArguments.
pruning_kwargs = dict(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 2e-5),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
training_args = TrainingArguments(**pruning_kwargs)
trainer = Trainer(training_args)

# Training is gated by the notebook-wide TRAIN switch; evaluation runs either way.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 2e-05)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Target sparsity: 0.99
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.99_pruning.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased/wikitext2/pruning/magnitude_pruning/0.99/run_3.log
[TRAINER] Training with mixed precision enabled
[TRAINER] Initial model sparsity: 0.0


Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.483.



                                                                                                   

Recovery epoch [1/10]: Avg Loss: 1.5665 | Avg Accuracy: 64.28 | Model Sparsity: 0.4831
Avg Perplexity: 5.570

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5645 | Avg Accuracy: 64.02 | Model Sparsity: 0.4831
Avg Perplexity: 5.635



                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5632 | Avg Accuracy: 64.18 | Model Sparsity: 0.4831
Avg Perplexity: 5.586



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5591 | Avg Accuracy: 64.44 | Model Sparsity: 0.4831
Avg Perplexity: 5.592

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5621 | Avg Accuracy: 63.86 | Model Sparsity: 0.4831
Avg Perplexity: 5.750



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5628 | Avg Accuracy: 64.36 | Model Sparsity: 0.4831
Avg Perplexity: 5.621



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5597 | Avg Accuracy: 64.21 | Model Sparsity: 0.4831
Avg Perplexity: 5.712



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5639 | Avg Accuracy: 63.86 | Model Sparsity: 0.4831
Avg Perplexity: 5.791



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5614 | Avg Accuracy: 63.93 | Model Sparsity: 0.4831
Avg Perplexity: 5.724



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5547 | Avg Accuracy: 64.57 | Model Sparsity: 0.4831
Avg Perplexity: 5.529

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5577 | Avg Accuracy: 64.00 | Model Sparsity: 0.4831
Avg Perplexity: 5.734



Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.776.



                                                                                                   

Recovery epoch [2/10]: Avg Loss: 1.5490 | Avg Accuracy: 63.99 | Model Sparsity: 0.7762
Avg Perplexity: 5.625

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5613 | Avg Accuracy: 64.05 | Model Sparsity: 0.7762
Avg Perplexity: 5.638

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5550 | Avg Accuracy: 64.42 | Model Sparsity: 0.7762
Avg Perplexity: 5.522

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5487 | Avg Accuracy: 64.01 | Model Sparsity: 0.7762
Avg Perplexity: 5.731



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5459 | Avg Accuracy: 64.10 | Model Sparsity: 0.7762
Avg Perplexity: 5.645



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5496 | Avg Accuracy: 64.00 | Model Sparsity: 0.7762
Avg Perplexity: 5.633



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5594 | Avg Accuracy: 64.79 | Model Sparsity: 0.7762
Avg Perplexity: 5.524

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5510 | Avg Accuracy: 64.33 | Model Sparsity: 0.7762
Avg Perplexity: 5.679



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5488 | Avg Accuracy: 64.34 | Model Sparsity: 0.7762
Avg Perplexity: 5.548



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5542 | Avg Accuracy: 64.08 | Model Sparsity: 0.7762
Avg Perplexity: 5.632



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5471 | Avg Accuracy: 64.59 | Model Sparsity: 0.7762
Avg Perplexity: 5.476



Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.927.



                                                                                                   

Recovery epoch [3/10]: Avg Loss: 1.5574 | Avg Accuracy: 64.11 | Model Sparsity: 0.9266
Avg Perplexity: 5.733

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5461 | Avg Accuracy: 63.90 | Model Sparsity: 0.9266
Avg Perplexity: 5.689



                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5509 | Avg Accuracy: 63.73 | Model Sparsity: 0.9266
Avg Perplexity: 5.734



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5413 | Avg Accuracy: 64.25 | Model Sparsity: 0.9266
Avg Perplexity: 5.692

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5504 | Avg Accuracy: 64.15 | Model Sparsity: 0.9266
Avg Perplexity: 5.715



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5522 | Avg Accuracy: 64.42 | Model Sparsity: 0.9266
Avg Perplexity: 5.694

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5515 | Avg Accuracy: 63.34 | Model Sparsity: 0.9266
Avg Perplexity: 5.779



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5466 | Avg Accuracy: 63.86 | Model Sparsity: 0.9266
Avg Perplexity: 5.682



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5430 | Avg Accuracy: 64.36 | Model Sparsity: 0.9266
Avg Perplexity: 5.545



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5392 | Avg Accuracy: 63.72 | Model Sparsity: 0.9266
Avg Perplexity: 5.858



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5447 | Avg Accuracy: 63.98 | Model Sparsity: 0.9266
Avg Perplexity: 5.654



Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.982.



                                                                                                   

Recovery epoch [4/10]: Avg Loss: 1.5405 | Avg Accuracy: 64.17 | Model Sparsity: 0.9821
Avg Perplexity: 5.727

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5396 | Avg Accuracy: 64.39 | Model Sparsity: 0.9821
Avg Perplexity: 5.651

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5388 | Avg Accuracy: 64.07 | Model Sparsity: 0.9821
Avg Perplexity: 5.642



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5321 | Avg Accuracy: 64.19 | Model Sparsity: 0.9821
Avg Perplexity: 5.681



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5337 | Avg Accuracy: 64.49 | Model Sparsity: 0.9821
Avg Perplexity: 5.654

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5403 | Avg Accuracy: 64.47 | Model Sparsity: 0.9821
Avg Perplexity: 5.484



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5327 | Avg Accuracy: 64.85 | Model Sparsity: 0.9821
Avg Perplexity: 5.513

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5407 | Avg Accuracy: 63.93 | Model Sparsity: 0.9821
Avg Perplexity: 5.687



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5373 | Avg Accuracy: 64.30 | Model Sparsity: 0.9821
Avg Perplexity: 5.616



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5355 | Avg Accuracy: 64.54 | Model Sparsity: 0.9821
Avg Perplexity: 5.521



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5361 | Avg Accuracy: 64.43 | Model Sparsity: 0.9821
Avg Perplexity: 5.628



Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.990.



                                                                                                 

Recovery epoch [5/10]: Avg Loss: 1.5308 | Avg Accuracy: 63.95 | Model Sparsity: 0.99
Avg Perplexity: 5.748

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 1.5329 | Avg Accuracy: 64.48 | Model Sparsity: 0.99
Avg Perplexity: 5.639

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 1.5301 | Avg Accuracy: 64.47 | Model Sparsity: 0.99
Avg Perplexity: 5.639



                                                                                                  

Recovery epoch [3/10]: Avg Loss: 1.5249 | Avg Accuracy: 64.25 | Model Sparsity: 0.99
Avg Perplexity: 5.606



                                                                                                  

Recovery epoch [4/10]: Avg Loss: 1.5243 | Avg Accuracy: 63.77 | Model Sparsity: 0.99
Avg Perplexity: 5.722



                                                                                                  

Recovery epoch [5/10]: Avg Loss: 1.5271 | Avg Accuracy: 63.90 | Model Sparsity: 0.99
Avg Perplexity: 5.771



                                                                                                  

Recovery epoch [6/10]: Avg Loss: 1.5218 | Avg Accuracy: 64.38 | Model Sparsity: 0.99
Avg Perplexity: 5.626



                                                                                                  

Recovery epoch [7/10]: Avg Loss: 1.5284 | Avg Accuracy: 64.37 | Model Sparsity: 0.99
Avg Perplexity: 5.589



                                                                                                  

Recovery epoch [8/10]: Avg Loss: 1.5233 | Avg Accuracy: 64.11 | Model Sparsity: 0.99
Avg Perplexity: 5.573



                                                                                                  

Recovery epoch [9/10]: Avg Loss: 1.5195 | Avg Accuracy: 63.85 | Model Sparsity: 0.99
Avg Perplexity: 5.724



                                                                                                   

Recovery epoch [10/10]: Avg Loss: 1.5156 | Avg Accuracy: 64.24 | Model Sparsity: 0.99
Avg Perplexity: 5.508

[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.99_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.99


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     64.68%
  Perplexity:   5.528

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9900 (99.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        2e-05
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         magnitude_pruning
  Target Sparsity:      0.99
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





### SNIP-it Prune

In [25]:
# SNIP pruning at TARGET_SPARSITY_LOW; the baseline checkpoint path is handed
# to the trainer through `finetuned_weights`.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Collect the run configuration first, then expand it into TrainingArguments.
pruning_kwargs = dict(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 2e-5),
    pruning_type="snip_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
training_args = TrainingArguments(**pruning_kwargs)
trainer = Trainer(training_args)

# Training is gated by the notebook-wide TRAIN switch; evaluation runs either way.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 2e-05)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: snip_pruning
[TRAINER] Target sparsity: 0.95
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.95_pruning.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased/wikitext2/pruning/snip_pruning/0.95/run_2.log
[TRAINER] Training with mixed precision enabled
[TRAINER] Initial model sparsity: 0.0


Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.464.



                                                                                                   

Recovery epoch [1/10]: Avg Loss: 1.5661 | Avg Accuracy: 64.13 | Model Sparsity: 0.4636
Avg Perplexity: 5.748

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5696 | Avg Accuracy: 63.57 | Model Sparsity: 0.4636
Avg Perplexity: 5.869



                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5643 | Avg Accuracy: 64.10 | Model Sparsity: 0.4636
Avg Perplexity: 5.725



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5592 | Avg Accuracy: 64.18 | Model Sparsity: 0.4636
Avg Perplexity: 5.645

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5635 | Avg Accuracy: 63.84 | Model Sparsity: 0.4636
Avg Perplexity: 5.712



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5509 | Avg Accuracy: 63.95 | Model Sparsity: 0.4636
Avg Perplexity: 5.646



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5627 | Avg Accuracy: 64.44 | Model Sparsity: 0.4636
Avg Perplexity: 5.683

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5633 | Avg Accuracy: 63.67 | Model Sparsity: 0.4636
Avg Perplexity: 5.781



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5605 | Avg Accuracy: 64.35 | Model Sparsity: 0.4636
Avg Perplexity: 5.589



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5562 | Avg Accuracy: 63.85 | Model Sparsity: 0.4636
Avg Perplexity: 5.780



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5671 | Avg Accuracy: 63.80 | Model Sparsity: 0.4636
Avg Perplexity: 5.830



Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.745.



                                                                                                   

Recovery epoch [2/10]: Avg Loss: 1.5654 | Avg Accuracy: 63.98 | Model Sparsity: 0.7448
Avg Perplexity: 5.745

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5544 | Avg Accuracy: 63.77 | Model Sparsity: 0.7448
Avg Perplexity: 5.635



                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5608 | Avg Accuracy: 64.23 | Model Sparsity: 0.7448
Avg Perplexity: 5.711

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5616 | Avg Accuracy: 64.14 | Model Sparsity: 0.7448
Avg Perplexity: 5.610



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5519 | Avg Accuracy: 63.87 | Model Sparsity: 0.7448
Avg Perplexity: 5.712



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5497 | Avg Accuracy: 64.01 | Model Sparsity: 0.7448
Avg Perplexity: 5.750



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5576 | Avg Accuracy: 64.27 | Model Sparsity: 0.7448
Avg Perplexity: 5.640

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5491 | Avg Accuracy: 64.45 | Model Sparsity: 0.7448
Avg Perplexity: 5.696

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5462 | Avg Accuracy: 64.27 | Model Sparsity: 0.7448
Avg Perplexity: 5.638



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5556 | Avg Accuracy: 64.01 | Model Sparsity: 0.7448
Avg Perplexity: 5.684



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5487 | Avg Accuracy: 64.09 | Model Sparsity: 0.7448
Avg Perplexity: 5.758



Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.889.



                                                                                                   

Recovery epoch [3/10]: Avg Loss: 1.5485 | Avg Accuracy: 64.64 | Model Sparsity: 0.8892
Avg Perplexity: 5.489

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5519 | Avg Accuracy: 64.44 | Model Sparsity: 0.8892
Avg Perplexity: 5.534



                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5484 | Avg Accuracy: 63.78 | Model Sparsity: 0.8892
Avg Perplexity: 5.727



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5462 | Avg Accuracy: 63.95 | Model Sparsity: 0.8892
Avg Perplexity: 5.695



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5443 | Avg Accuracy: 64.02 | Model Sparsity: 0.8892
Avg Perplexity: 5.597



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5437 | Avg Accuracy: 63.86 | Model Sparsity: 0.8892
Avg Perplexity: 5.663



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5439 | Avg Accuracy: 64.01 | Model Sparsity: 0.8892
Avg Perplexity: 5.711



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5452 | Avg Accuracy: 64.41 | Model Sparsity: 0.8892
Avg Perplexity: 5.567



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5413 | Avg Accuracy: 64.57 | Model Sparsity: 0.8892
Avg Perplexity: 5.541



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5440 | Avg Accuracy: 64.03 | Model Sparsity: 0.8892
Avg Perplexity: 5.712



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5413 | Avg Accuracy: 63.87 | Model Sparsity: 0.8892
Avg Perplexity: 5.678



Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.942.



                                                                                                   

Recovery epoch [4/10]: Avg Loss: 1.5469 | Avg Accuracy: 64.26 | Model Sparsity: 0.9424
Avg Perplexity: 5.651

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5361 | Avg Accuracy: 64.43 | Model Sparsity: 0.9424
Avg Perplexity: 5.555

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5402 | Avg Accuracy: 64.25 | Model Sparsity: 0.9424
Avg Perplexity: 5.619



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5371 | Avg Accuracy: 63.87 | Model Sparsity: 0.9424
Avg Perplexity: 5.671



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5331 | Avg Accuracy: 64.29 | Model Sparsity: 0.9424
Avg Perplexity: 5.615



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5349 | Avg Accuracy: 64.41 | Model Sparsity: 0.9424
Avg Perplexity: 5.596



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5280 | Avg Accuracy: 63.86 | Model Sparsity: 0.9424
Avg Perplexity: 5.734



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5327 | Avg Accuracy: 64.35 | Model Sparsity: 0.9424
Avg Perplexity: 5.605



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5321 | Avg Accuracy: 64.86 | Model Sparsity: 0.9424
Avg Perplexity: 5.472

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5341 | Avg Accuracy: 64.16 | Model Sparsity: 0.9424
Avg Perplexity: 5.666



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5318 | Avg Accuracy: 64.12 | Model Sparsity: 0.9424
Avg Perplexity: 5.728



Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.950.



                                                                                                 

Recovery epoch [5/10]: Avg Loss: 1.5308 | Avg Accuracy: 64.26 | Model Sparsity: 0.95
Avg Perplexity: 5.667

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 1.5314 | Avg Accuracy: 64.30 | Model Sparsity: 0.95
Avg Perplexity: 5.613

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 1.5328 | Avg Accuracy: 64.10 | Model Sparsity: 0.95
Avg Perplexity: 5.588



                                                                                                  

Recovery epoch [3/10]: Avg Loss: 1.5289 | Avg Accuracy: 63.77 | Model Sparsity: 0.95
Avg Perplexity: 5.662



                                                                                                  

Recovery epoch [4/10]: Avg Loss: 1.5214 | Avg Accuracy: 64.45 | Model Sparsity: 0.95
Avg Perplexity: 5.572

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [5/10]: Avg Loss: 1.5215 | Avg Accuracy: 63.96 | Model Sparsity: 0.95
Avg Perplexity: 5.702



                                                                                                  

Recovery epoch [6/10]: Avg Loss: 1.5248 | Avg Accuracy: 64.34 | Model Sparsity: 0.95
Avg Perplexity: 5.619



                                                                                                  

Recovery epoch [7/10]: Avg Loss: 1.5204 | Avg Accuracy: 64.09 | Model Sparsity: 0.95
Avg Perplexity: 5.650



                                                                                                  

Recovery epoch [8/10]: Avg Loss: 1.5291 | Avg Accuracy: 64.12 | Model Sparsity: 0.95
Avg Perplexity: 5.668



                                                                                                  

Recovery epoch [9/10]: Avg Loss: 1.5275 | Avg Accuracy: 63.55 | Model Sparsity: 0.95
Avg Perplexity: 5.806



                                                                                                   

Recovery epoch [10/10]: Avg Loss: 1.5330 | Avg Accuracy: 64.33 | Model Sparsity: 0.95
Avg Perplexity: 5.674

[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.95_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.95


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     64.52%
  Perplexity:   5.533

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9500 (95.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        2e-05
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         snip_pruning
  Target Sparsity:      0.95
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [26]:
# SNIP pruning run at the mid sparsity target (reported as 0.97 in the recorded
# output below). The run starts from the fine-tuned baseline checkpoint rather
# than from freshly initialised weights.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Gather the run configuration in one place before building the arguments object.
# Epoch counts and the LR scheduler are not set here; the recorded output reports
# 5 pruning epochs, 10 recovery epochs and no scheduler for this configuration.
pruning_run_config = dict(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 2e-5),
    pruning_type="snip_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
training_args = TrainingArguments(**pruning_run_config)
trainer = Trainer(training_args=training_args)

# TRAIN is the notebook-level switch: when it is False the pruning loop is
# skipped, but evaluation and the summary still run.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 2e-05)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: snip_pruning
[TRAINER] Target sparsity: 0.97
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.97_pruning.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased/wikitext2/pruning/snip_pruning/0.97/run_2.log
[TRAINER] Training with mixed precision enabled
[TRAINER] Initial model sparsity: 0.0


Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.473.



                                                                                                   

Recovery epoch [1/10]: Avg Loss: 1.5746 | Avg Accuracy: 63.63 | Model Sparsity: 0.4734
Avg Perplexity: 5.860

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5677 | Avg Accuracy: 64.15 | Model Sparsity: 0.4734
Avg Perplexity: 5.687

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5651 | Avg Accuracy: 64.18 | Model Sparsity: 0.4734
Avg Perplexity: 5.594

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5572 | Avg Accuracy: 64.32 | Model Sparsity: 0.4734
Avg Perplexity: 5.519

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5632 | Avg Accuracy: 64.30 | Model Sparsity: 0.4734
Avg Perplexity: 5.680



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5587 | Avg Accuracy: 63.78 | Model Sparsity: 0.4734
Avg Perplexity: 5.880



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5630 | Avg Accuracy: 64.07 | Model Sparsity: 0.4734
Avg Perplexity: 5.725



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5641 | Avg Accuracy: 64.42 | Model Sparsity: 0.4734
Avg Perplexity: 5.594

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5537 | Avg Accuracy: 64.44 | Model Sparsity: 0.4734
Avg Perplexity: 5.546

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5659 | Avg Accuracy: 63.98 | Model Sparsity: 0.4734
Avg Perplexity: 5.637



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5540 | Avg Accuracy: 63.97 | Model Sparsity: 0.4734
Avg Perplexity: 5.673



Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.760.



                                                                                                   

Recovery epoch [2/10]: Avg Loss: 1.5588 | Avg Accuracy: 64.28 | Model Sparsity: 0.7605
Avg Perplexity: 5.661

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5583 | Avg Accuracy: 64.47 | Model Sparsity: 0.7605
Avg Perplexity: 5.667

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5550 | Avg Accuracy: 64.40 | Model Sparsity: 0.7605
Avg Perplexity: 5.558



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5553 | Avg Accuracy: 64.49 | Model Sparsity: 0.7605
Avg Perplexity: 5.566

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5543 | Avg Accuracy: 64.03 | Model Sparsity: 0.7605
Avg Perplexity: 5.707



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5500 | Avg Accuracy: 64.14 | Model Sparsity: 0.7605
Avg Perplexity: 5.567



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5557 | Avg Accuracy: 63.98 | Model Sparsity: 0.7605
Avg Perplexity: 5.712



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5502 | Avg Accuracy: 64.13 | Model Sparsity: 0.7605
Avg Perplexity: 5.616



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5479 | Avg Accuracy: 64.06 | Model Sparsity: 0.7605
Avg Perplexity: 5.729



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5528 | Avg Accuracy: 63.74 | Model Sparsity: 0.7605
Avg Perplexity: 5.691



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5489 | Avg Accuracy: 64.36 | Model Sparsity: 0.7605
Avg Perplexity: 5.671



Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.908.



                                                                                                   

Recovery epoch [3/10]: Avg Loss: 1.5519 | Avg Accuracy: 64.24 | Model Sparsity: 0.9079
Avg Perplexity: 5.600

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5541 | Avg Accuracy: 63.95 | Model Sparsity: 0.9079
Avg Perplexity: 5.693



                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5514 | Avg Accuracy: 64.32 | Model Sparsity: 0.9079
Avg Perplexity: 5.670

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5507 | Avg Accuracy: 64.40 | Model Sparsity: 0.9079
Avg Perplexity: 5.658

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5426 | Avg Accuracy: 64.11 | Model Sparsity: 0.9079
Avg Perplexity: 5.685



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5407 | Avg Accuracy: 63.85 | Model Sparsity: 0.9079
Avg Perplexity: 5.722



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5486 | Avg Accuracy: 64.67 | Model Sparsity: 0.9079
Avg Perplexity: 5.623

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5460 | Avg Accuracy: 64.51 | Model Sparsity: 0.9079
Avg Perplexity: 5.634



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5346 | Avg Accuracy: 64.28 | Model Sparsity: 0.9079
Avg Perplexity: 5.655



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5422 | Avg Accuracy: 64.15 | Model Sparsity: 0.9079
Avg Perplexity: 5.721



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5430 | Avg Accuracy: 64.46 | Model Sparsity: 0.9079
Avg Perplexity: 5.513



Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.962.



                                                                                                   

Recovery epoch [4/10]: Avg Loss: 1.5403 | Avg Accuracy: 64.46 | Model Sparsity: 0.9622
Avg Perplexity: 5.610

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5393 | Avg Accuracy: 64.36 | Model Sparsity: 0.9622
Avg Perplexity: 5.543



                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5346 | Avg Accuracy: 64.65 | Model Sparsity: 0.9622
Avg Perplexity: 5.562

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5341 | Avg Accuracy: 64.36 | Model Sparsity: 0.9622
Avg Perplexity: 5.638



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5360 | Avg Accuracy: 63.78 | Model Sparsity: 0.9622
Avg Perplexity: 5.761



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5334 | Avg Accuracy: 64.03 | Model Sparsity: 0.9622
Avg Perplexity: 5.624



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5369 | Avg Accuracy: 64.36 | Model Sparsity: 0.9622
Avg Perplexity: 5.578



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5344 | Avg Accuracy: 64.14 | Model Sparsity: 0.9622
Avg Perplexity: 5.635



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5317 | Avg Accuracy: 64.09 | Model Sparsity: 0.9622
Avg Perplexity: 5.612



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5362 | Avg Accuracy: 64.63 | Model Sparsity: 0.9622
Avg Perplexity: 5.579



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5393 | Avg Accuracy: 64.26 | Model Sparsity: 0.9622
Avg Perplexity: 5.644



Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.970.



                                                                                                 

Recovery epoch [5/10]: Avg Loss: 1.5309 | Avg Accuracy: 64.17 | Model Sparsity: 0.97
Avg Perplexity: 5.678

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 1.5326 | Avg Accuracy: 64.40 | Model Sparsity: 0.97
Avg Perplexity: 5.632

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 1.5264 | Avg Accuracy: 63.85 | Model Sparsity: 0.97
Avg Perplexity: 5.645



                                                                                                  

Recovery epoch [3/10]: Avg Loss: 1.5259 | Avg Accuracy: 64.49 | Model Sparsity: 0.97
Avg Perplexity: 5.542

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [4/10]: Avg Loss: 1.5281 | Avg Accuracy: 63.79 | Model Sparsity: 0.97
Avg Perplexity: 5.780



                                                                                                  

Recovery epoch [5/10]: Avg Loss: 1.5312 | Avg Accuracy: 64.00 | Model Sparsity: 0.97
Avg Perplexity: 5.727



                                                                                                  

Recovery epoch [6/10]: Avg Loss: 1.5296 | Avg Accuracy: 64.07 | Model Sparsity: 0.97
Avg Perplexity: 5.686



                                                                                                  

Recovery epoch [7/10]: Avg Loss: 1.5226 | Avg Accuracy: 64.46 | Model Sparsity: 0.97
Avg Perplexity: 5.542



                                                                                                  

Recovery epoch [8/10]: Avg Loss: 1.5289 | Avg Accuracy: 64.27 | Model Sparsity: 0.97
Avg Perplexity: 5.612



                                                                                                  

Recovery epoch [9/10]: Avg Loss: 1.5189 | Avg Accuracy: 64.33 | Model Sparsity: 0.97
Avg Perplexity: 5.601



                                                                                                   

Recovery epoch [10/10]: Avg Loss: 1.5367 | Avg Accuracy: 63.69 | Model Sparsity: 0.97
Avg Perplexity: 5.740

[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.97_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.97


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     64.76%
  Perplexity:   5.579

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9700 (97.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        2e-05
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         snip_pruning
  Target Sparsity:      0.97
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [27]:
# SNIP pruning run at the high sparsity target (reported as 0.99 in the recorded
# output below). The run starts from the fine-tuned baseline checkpoint rather
# than from freshly initialised weights.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Gather the run configuration in one place before building the arguments object.
# Epoch counts and the LR scheduler are not set here; the recorded output reports
# 5 pruning epochs and no scheduler for this configuration.
pruning_run_config = dict(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 2e-5),
    pruning_type="snip_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
training_args = TrainingArguments(**pruning_run_config)
trainer = Trainer(training_args=training_args)

# TRAIN is the notebook-level switch: when it is False the pruning loop is
# skipped, but evaluation and the summary still run.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 2e-05)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: snip_pruning
[TRAINER] Target sparsity: 0.99
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.99_pruning.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased/wikitext2/pruning/snip_pruning/0.99/run_2.log
[TRAINER] Training with mixed precision enabled
[TRAINER] Initial model sparsity: 0.0


Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.483.



                                                                                                   

Recovery epoch [1/10]: Avg Loss: 1.5648 | Avg Accuracy: 63.60 | Model Sparsity: 0.4831
Avg Perplexity: 5.803

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5668 | Avg Accuracy: 64.59 | Model Sparsity: 0.4831
Avg Perplexity: 5.585

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5657 | Avg Accuracy: 64.04 | Model Sparsity: 0.4831
Avg Perplexity: 5.637



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5740 | Avg Accuracy: 63.88 | Model Sparsity: 0.4831
Avg Perplexity: 5.853



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5634 | Avg Accuracy: 64.08 | Model Sparsity: 0.4831
Avg Perplexity: 5.739



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5598 | Avg Accuracy: 64.52 | Model Sparsity: 0.4831
Avg Perplexity: 5.545



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5667 | Avg Accuracy: 63.79 | Model Sparsity: 0.4831
Avg Perplexity: 5.755



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5539 | Avg Accuracy: 64.18 | Model Sparsity: 0.4831
Avg Perplexity: 5.701



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5732 | Avg Accuracy: 63.52 | Model Sparsity: 0.4831
Avg Perplexity: 5.846



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5652 | Avg Accuracy: 63.59 | Model Sparsity: 0.4831
Avg Perplexity: 5.706



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5600 | Avg Accuracy: 64.63 | Model Sparsity: 0.4831
Avg Perplexity: 5.489

[TRAINER] weights saved!


Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.776.



                                                                                                   

Recovery epoch [2/10]: Avg Loss: 1.5604 | Avg Accuracy: 63.96 | Model Sparsity: 0.7762
Avg Perplexity: 5.720

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5562 | Avg Accuracy: 64.17 | Model Sparsity: 0.7762
Avg Perplexity: 5.661

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5569 | Avg Accuracy: 64.11 | Model Sparsity: 0.7762
Avg Perplexity: 5.640



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5571 | Avg Accuracy: 64.28 | Model Sparsity: 0.7762
Avg Perplexity: 5.624

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5488 | Avg Accuracy: 64.07 | Model Sparsity: 0.7762
Avg Perplexity: 5.677



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5558 | Avg Accuracy: 64.10 | Model Sparsity: 0.7762
Avg Perplexity: 5.672



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5558 | Avg Accuracy: 63.78 | Model Sparsity: 0.7762
Avg Perplexity: 5.636



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5551 | Avg Accuracy: 63.96 | Model Sparsity: 0.7762
Avg Perplexity: 5.744



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5535 | Avg Accuracy: 64.21 | Model Sparsity: 0.7762
Avg Perplexity: 5.679



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5557 | Avg Accuracy: 63.92 | Model Sparsity: 0.7762
Avg Perplexity: 5.808



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5559 | Avg Accuracy: 64.16 | Model Sparsity: 0.7762
Avg Perplexity: 5.704



Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.927.



                                                                                                   

Recovery epoch [3/10]: Avg Loss: 1.5516 | Avg Accuracy: 64.57 | Model Sparsity: 0.9266
Avg Perplexity: 5.501

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5511 | Avg Accuracy: 64.15 | Model Sparsity: 0.9266
Avg Perplexity: 5.627



                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5477 | Avg Accuracy: 64.12 | Model Sparsity: 0.9266
Avg Perplexity: 5.644



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5621 | Avg Accuracy: 64.27 | Model Sparsity: 0.9266
Avg Perplexity: 5.563



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5508 | Avg Accuracy: 63.80 | Model Sparsity: 0.9266
Avg Perplexity: 5.715



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5490 | Avg Accuracy: 63.64 | Model Sparsity: 0.9266
Avg Perplexity: 5.857



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5460 | Avg Accuracy: 64.46 | Model Sparsity: 0.9266
Avg Perplexity: 5.653



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5501 | Avg Accuracy: 63.55 | Model Sparsity: 0.9266
Avg Perplexity: 5.693



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5463 | Avg Accuracy: 63.65 | Model Sparsity: 0.9266
Avg Perplexity: 5.838



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5426 | Avg Accuracy: 63.78 | Model Sparsity: 0.9266
Avg Perplexity: 5.725



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5440 | Avg Accuracy: 64.10 | Model Sparsity: 0.9266
Avg Perplexity: 5.691



Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.982.



                                                                                                   

Recovery epoch [4/10]: Avg Loss: 1.5475 | Avg Accuracy: 63.96 | Model Sparsity: 0.9821
Avg Perplexity: 5.765

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.5383 | Avg Accuracy: 63.88 | Model Sparsity: 0.9821
Avg Perplexity: 5.749



                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.5371 | Avg Accuracy: 64.20 | Model Sparsity: 0.9821
Avg Perplexity: 5.658

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.5353 | Avg Accuracy: 64.25 | Model Sparsity: 0.9821
Avg Perplexity: 5.650

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.5310 | Avg Accuracy: 64.20 | Model Sparsity: 0.9821
Avg Perplexity: 5.678



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.5384 | Avg Accuracy: 64.54 | Model Sparsity: 0.9821
Avg Perplexity: 5.478

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.5346 | Avg Accuracy: 64.10 | Model Sparsity: 0.9821
Avg Perplexity: 5.687



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.5359 | Avg Accuracy: 63.96 | Model Sparsity: 0.9821
Avg Perplexity: 5.697



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.5341 | Avg Accuracy: 63.92 | Model Sparsity: 0.9821
Avg Perplexity: 5.675



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.5296 | Avg Accuracy: 64.18 | Model Sparsity: 0.9821
Avg Perplexity: 5.544



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.5281 | Avg Accuracy: 64.42 | Model Sparsity: 0.9821
Avg Perplexity: 5.545



Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.990.



                                                                                                 

Recovery epoch [5/10]: Avg Loss: 1.5367 | Avg Accuracy: 64.03 | Model Sparsity: 0.99
Avg Perplexity: 5.647

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 1.5264 | Avg Accuracy: 64.14 | Model Sparsity: 0.99
Avg Perplexity: 5.642

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 1.5312 | Avg Accuracy: 63.77 | Model Sparsity: 0.99
Avg Perplexity: 5.709



                                                                                                  

Recovery epoch [3/10]: Avg Loss: 1.5259 | Avg Accuracy: 64.19 | Model Sparsity: 0.99
Avg Perplexity: 5.638

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [4/10]: Avg Loss: 1.5260 | Avg Accuracy: 64.31 | Model Sparsity: 0.99
Avg Perplexity: 5.658

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [5/10]: Avg Loss: 1.5337 | Avg Accuracy: 63.85 | Model Sparsity: 0.99
Avg Perplexity: 5.729



                                                                                                  

Recovery epoch [6/10]: Avg Loss: 1.5351 | Avg Accuracy: 64.17 | Model Sparsity: 0.99
Avg Perplexity: 5.661



                                                                                                  

Recovery epoch [7/10]: Avg Loss: 1.5282 | Avg Accuracy: 64.48 | Model Sparsity: 0.99
Avg Perplexity: 5.567

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [8/10]: Avg Loss: 1.5332 | Avg Accuracy: 63.99 | Model Sparsity: 0.99
Avg Perplexity: 5.799



                                                                                                  

Recovery epoch [9/10]: Avg Loss: 1.5320 | Avg Accuracy: 64.21 | Model Sparsity: 0.99
Avg Perplexity: 5.556



                                                                                                   

Recovery epoch [10/10]: Avg Loss: 1.5250 | Avg Accuracy: 64.21 | Model Sparsity: 0.99
Avg Perplexity: 5.706

[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.99_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.99


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     64.78%
  Perplexity:   5.558

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9900 (99.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        2e-05
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         snip_pruning
  Target Sparsity:      0.99
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





### WandA Prune

In [28]:
# Pruning run with pruning_type "wanda_pruning" at TARGET_SPARSITY_LOW, using the
# baseline weights file as finetuned_weights.
# NOTE(review): the output recorded for this cell ends with
# "AttributeError: 'NoneType' object has no attribute 'dim'". No traceback was
# kept, so the failing call is unknown — TODO reproduce and fix before relying
# on any result from this cell.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

training_args = TrainingArguments(
    # Model, task and starting weights
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    # Pruning setup
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    # Optimisation setup
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 2e-5),
    db=False,
)
trainer = Trainer(training_args=training_args)

# TRAIN gates only the training run; evaluation and the summary always execute.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 2e-05)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized


AttributeError: 'NoneType' object has no attribute 'dim'

In [0]:
# Pruning run with pruning_type "wanda_pruning" at TARGET_SPARSITY_MID, using the
# baseline weights file as finetuned_weights.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

training_args = TrainingArguments(
    # Model, task and starting weights
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    # Pruning setup
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    # Optimisation setup
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 2e-5),
    db=False,
)
trainer = Trainer(training_args=training_args)

# TRAIN gates only the training run; evaluation and the summary always execute.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
# Pruning run with pruning_type "wanda_pruning" at TARGET_SPARSITY_HIGH, using the
# baseline weights file as finetuned_weights.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

training_args = TrainingArguments(
    # Model, task and starting weights
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    # Pruning setup
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    # Optimisation setup
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 2e-5),
    db=False,
)
trainer = Trainer(training_args=training_args)

# TRAIN gates only the training run; evaluation and the summary always execute.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

## BaCP Accuracies

### Magnitude Pruning

In [0]:
# BaCP run: magnitude pruning at TARGET_SPARSITY_LOW, followed by a finetuning phase.
# The baseline weights file is passed to the BaCP trainer as finetuned_weights.
trained_model_path = f"/dbfs/research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
)
bacp_trainer = BaCPTrainer(bacp_training_args)
# Honour the notebook-wide TRAIN switch (as the sibling cells do) instead of a
# hard-coded `if True:`.
if TRAIN:
    bacp_trainer.train()

# Finetuning Phase
# A regular Trainer is configured from the BaCP trainer's attributes: it loads
# bacp_trainer.save_path as its weights and reuses bacp_trainer.get_pruner().
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask
# from the current model weights — confirm in bacp.py.
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print(metrics)

In [0]:
# BaCP run: magnitude pruning at TARGET_SPARSITY_MID, followed by a finetuning phase.
# The baseline weights file is passed to the BaCP trainer as finetuned_weights.
trained_model_path = f"/dbfs/research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    # Optimizer is given as one (type, lr) tuple: this is the keyword every
    # executed cell of this notebook uses, replacing the separate
    # optimizer_type= / learning_rate= keywords.
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning Phase
# A regular Trainer is configured from the BaCP trainer's attributes: it loads
# bacp_trainer.save_path as its weights and reuses bacp_trainer.get_pruner().
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask
# from the current model weights — confirm in bacp.py.
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print(metrics)

In [0]:
# BaCP run: magnitude pruning at TARGET_SPARSITY_HIGH, followed by a finetuning phase.
# The baseline weights file is passed to the BaCP trainer as finetuned_weights.
trained_model_path = f"/dbfs/research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    # Optimizer is given as one (type, lr) tuple: this is the keyword every
    # executed cell of this notebook uses, replacing the separate
    # optimizer_type= / learning_rate= keywords.
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning Phase
# A regular Trainer is configured from the BaCP trainer's attributes: it loads
# bacp_trainer.save_path as its weights and reuses bacp_trainer.get_pruner().
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask
# from the current model weights — confirm in bacp.py.
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print(metrics)

### Movement Pruning

In [0]:
# BaCP run: movement pruning at TARGET_SPARSITY_LOW, followed by a finetuning phase.
# Uses the notebook-level MODEL_NAME / MODEL_TASK constants rather than repeating
# their literal values in cell-local variables.
trained_model_path = f"/dbfs/research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    # Optimizer is given as one (type, lr) tuple: this is the keyword every
    # executed cell of this notebook uses, replacing the separate
    # optimizer_type= / learning_rate= keywords.
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="movement_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
)
bacp_trainer = BaCPTrainer(bacp_training_args)
# Honour the notebook-wide TRAIN switch instead of a hard-coded `if True:`.
if TRAIN:
    bacp_trainer.train()

# Finetuning Phase
# A regular Trainer is configured from the BaCP trainer's attributes: it loads
# bacp_trainer.save_path as its weights and reuses bacp_trainer.get_pruner().
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask
# from the current model weights — confirm in bacp.py.
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print(metrics)

In [0]:
# BaCP run: movement pruning at TARGET_SPARSITY_MID, followed by a finetuning phase.
# Uses the notebook-level MODEL_NAME / MODEL_TASK constants rather than repeating
# their literal values in cell-local variables.
trained_model_path = f"/dbfs/research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    # Optimizer is given as one (type, lr) tuple: this is the keyword every
    # executed cell of this notebook uses, replacing the separate
    # optimizer_type= / learning_rate= keywords.
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="movement_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
)
bacp_trainer = BaCPTrainer(bacp_training_args)
# Honour the notebook-wide TRAIN switch instead of a hard-coded `if True:`.
if TRAIN:
    bacp_trainer.train()

# Finetuning Phase
# A regular Trainer is configured from the BaCP trainer's attributes: it loads
# bacp_trainer.save_path as its weights and reuses bacp_trainer.get_pruner().
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask
# from the current model weights — confirm in bacp.py.
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print(metrics)

In [0]:
# BaCP run: movement pruning at TARGET_SPARSITY_HIGH, followed by a finetuning phase.
# Uses the notebook-level MODEL_NAME / MODEL_TASK constants rather than repeating
# their literal values in cell-local variables.
trained_model_path = f"/dbfs/research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    # Optimizer is given as one (type, lr) tuple: this is the keyword every
    # executed cell of this notebook uses, replacing the separate
    # optimizer_type= / learning_rate= keywords.
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="movement_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
)
bacp_trainer = BaCPTrainer(bacp_training_args)
# NOTE(review): BaCP training is hard-disabled in this cell only, so the mask
# generation and finetuning below run against whatever state BaCPTrainer has
# without training. Left disabled to avoid silently starting a long run —
# confirm whether this should be `if TRAIN:` like the sibling cells.
if False:
    bacp_trainer.train()

# Finetuning Phase
# A regular Trainer is configured from the BaCP trainer's attributes: it loads
# bacp_trainer.save_path as its weights and reuses bacp_trainer.get_pruner().
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask
# from the current model weights — confirm in bacp.py.
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
)
trainer = Trainer(training_args)
# Honour the notebook-wide TRAIN switch instead of a hard-coded `if True:`.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print(metrics)

### WandA Pruning

In [0]:
# BaCP run: pruning_type "wanda_pruning" at TARGET_SPARSITY_LOW, followed by a
# finetuning phase. The baseline weights file is passed as finetuned_weights.
trained_model_path = f"/dbfs/research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    # Optimizer is given as one (type, lr) tuple: this is the keyword every
    # executed cell of this notebook uses, replacing the separate
    # optimizer_type= / learning_rate= keywords.
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning Phase
# A regular Trainer is configured from the BaCP trainer's attributes: it loads
# bacp_trainer.save_path as its weights and reuses bacp_trainer.get_pruner().
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask
# from the current model weights — confirm in bacp.py.
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print(metrics)

In [0]:
# BaCP run: pruning_type "wanda_pruning" at TARGET_SPARSITY_MID, followed by a
# finetuning phase. The baseline weights file is passed as finetuned_weights.
trained_model_path = f"/dbfs/research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    # Optimizer is given as one (type, lr) tuple: this is the keyword every
    # executed cell of this notebook uses, replacing the separate
    # optimizer_type= / learning_rate= keywords.
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning Phase
# A regular Trainer is configured from the BaCP trainer's attributes: it loads
# bacp_trainer.save_path as its weights and reuses bacp_trainer.get_pruner().
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask
# from the current model weights — confirm in bacp.py.
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print(metrics)

In [0]:
# BaCP run: pruning_type "wanda_pruning" at TARGET_SPARSITY_HIGH, followed by a
# finetuning phase. The baseline weights file is passed as finetuned_weights.
trained_model_path = f"/dbfs/research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    # Optimizer is given as one (type, lr) tuple: this is the keyword every
    # executed cell of this notebook uses, replacing the separate
    # optimizer_type= / learning_rate= keywords.
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning Phase
# A regular Trainer is configured from the BaCP trainer's attributes: it loads
# bacp_trainer.save_path as its weights and reuses bacp_trainer.get_pruner().
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask
# from the current model weights — confirm in bacp.py.
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print(metrics)