In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [1]:
import os
import sys

# Make the project modules that live one directory up importable from here.
sys.path.append(os.path.abspath('..'))

# Configure Hugging Face through the environment BEFORE anything imports
# `datasets`: the library resolves HF_DATASETS_CACHE once, at import time, so
# assigning it after the import (as this cell previously did) has no effect.
# NOTE(review): the project modules imported below may pull in `datasets`
# themselves, which is why these assignments precede them as well.
os.environ["HF_DATASETS_CACHE"] = "/dbfs/hf_datasets"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from constants import (
    TARGET_SPARSITY_LOW, TARGET_SPARSITY_MID, TARGET_SPARSITY_HIGH,
    BATCH_SIZE_CNN, BATCH_SIZE_VIT, BATCH_SIZE_LLM,
    EPOCHS_SMALL_MODEL, EPOCHS_LARGE_MODEL, EPOCHS_VIT
)
from utils import get_device, get_num_workers, load_weights, print_statistics
from unstructured_pruning import check_model_sparsity, check_sparsity_distribution
from trainer import TrainingArguments, Trainer
from bacp import BaCPTrainingArguments, BaCPTrainer

from datasets.utils.logging import disable_progress_bar

# Silence the `datasets` progress bars so they do not flood the cell output.
disable_progress_bar()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Resolve the compute device and the worker count once, then report both.
DEVICE, NUM_WORKERS = get_device(), get_num_workers()
print(f"Using device: {DEVICE}")
print(f"Using {NUM_WORKERS} workers")

Using device: cuda
Using 288 workers


# DistilBERT

In [3]:
# Model identifier; passed as `model_name` to TrainingArguments and used to
# build the checkpoint paths under ./research/.
MODEL_NAME = "distilbert-base-uncased"
# Task identifier; passed as `model_task` and used in the same checkpoint paths.
MODEL_TASK = "wikitext2"
# Notebook-level switch: cells guarded by `if TRAIN:` call trainer.train()
# before evaluating; set to False to evaluate only.
TRAIN = True

## Baseline Accuracies

In [9]:
# Cell-local switch replacing a bare `if False:` guard. Baseline fine-tuning is
# skipped by default; set this to True to re-run it. Evaluation runs either way.
RETRAIN_BASELINE = False

# Baseline (unpruned) configuration: AdamW at 5e-5 with a linear warmup schedule.
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-5),
    scheduler_type='linear_with_warmup',
    epochs=50,
    learning_type="baseline",
    db=False,
)
trainer = Trainer(training_args=training_args)
if RETRAIN_BASELINE:
    trainer.train()

# Evaluate and print the summary report for the baseline model.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Optimizer type w/ learning rate: (adamw, 5e-05)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] Linear scheduler initialized with warmup steps: 355 and total steps: 3550
[TRAINER] Pruning not initialized
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.0


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     64.00%
  Perplexity:   5.743

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.0000 (0.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        baseline
  Batch Size:           64
  Learning Rate:        5e-05
  Optimizer:            adamw
  Epochs:               50

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





## Pruning Accuracies

### Magnitude Pruning

In [21]:
# Cell-local switch replacing a bare `if False:` guard. Pruning at the low
# sparsity target is skipped by default; set this to True to re-run it.
# Evaluation runs either way.
RETRAIN_MAGNITUDE_LOW = False

# Path of the fine-tuned baseline checkpoint that pruning starts from.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
# Magnitude pruning towards TARGET_SPARSITY_LOW with a cubic sparsity schedule.
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if RETRAIN_MAGNITUDE_LOW:
    trainer.train()

# Evaluate and print the summary report for the pruned model.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Target sparsity: 0.95
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.95_pruning.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.95_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.95


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     47.98%
  Perplexity:   27.106

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9500 (95.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         magnitude_pruning
  Target Sparsity:      0.95
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [22]:
# Fine-tuned baseline checkpoint that this pruning run starts from.
checkpoint_dir = f"./research/{MODEL_NAME}/{MODEL_TASK}"
finetuned_weights = f"{checkpoint_dir}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Magnitude pruning towards TARGET_SPARSITY_MID with a cubic sparsity schedule.
pruning_config = dict(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
training_args = TrainingArguments(**pruning_config)
trainer = Trainer(training_args)

# Train only when the notebook-level TRAIN switch is on; evaluation always runs.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Target sparsity: 0.97
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.97_pruning.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased/wikitext2/pruning/magnitude_pruning/0.97/run_1.log
[TRAINER] Training with mixed precision enabled
[TRAINER] Initial model sparsity: 0.0


Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.473.



                                                                                                   

Training epoch [1/5]: Avg Loss: 2.1292 | Avg Accuracy: 58.90 | Model Sparsity: 0.4734
Avg Perplexity: 8.251

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.9564 | Avg Accuracy: 59.13 | Model Sparsity: 0.4734
Avg Perplexity: 8.067

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.9048 | Avg Accuracy: 59.17 | Model Sparsity: 0.4734
Avg Perplexity: 8.149

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.8746 | Avg Accuracy: 58.99 | Model Sparsity: 0.4734
Avg Perplexity: 8.308



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.8422 | Avg Accuracy: 58.37 | Model Sparsity: 0.4734
Avg Perplexity: 8.736



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.7938 | Avg Accuracy: 59.17 | Model Sparsity: 0.4734
Avg Perplexity: 8.371

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.7739 | Avg Accuracy: 58.76 | Model Sparsity: 0.4734
Avg Perplexity: 8.556



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.7454 | Avg Accuracy: 58.32 | Model Sparsity: 0.4734
Avg Perplexity: 8.956



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.7123 | Avg Accuracy: 58.50 | Model Sparsity: 0.4734
Avg Perplexity: 9.045



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.6896 | Avg Accuracy: 58.30 | Model Sparsity: 0.4734
Avg Perplexity: 8.998



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.6658 | Avg Accuracy: 58.22 | Model Sparsity: 0.4734
Avg Perplexity: 9.103



Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.760.



                                                                                                   

Training epoch [2/5]: Avg Loss: 3.2906 | Avg Accuracy: 48.78 | Model Sparsity: 0.7605
Avg Perplexity: 18.337

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.4141 | Avg Accuracy: 50.61 | Model Sparsity: 0.7605
Avg Perplexity: 16.632

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.2494 | Avg Accuracy: 51.62 | Model Sparsity: 0.7605
Avg Perplexity: 15.259

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.1393 | Avg Accuracy: 52.62 | Model Sparsity: 0.7605
Avg Perplexity: 14.167

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.0646 | Avg Accuracy: 52.31 | Model Sparsity: 0.7605
Avg Perplexity: 14.457



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.0002 | Avg Accuracy: 53.06 | Model Sparsity: 0.7605
Avg Perplexity: 14.206

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.9597 | Avg Accuracy: 53.10 | Model Sparsity: 0.7605
Avg Perplexity: 14.163

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.9035 | Avg Accuracy: 53.45 | Model Sparsity: 0.7605
Avg Perplexity: 13.748

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.8713 | Avg Accuracy: 53.79 | Model Sparsity: 0.7605
Avg Perplexity: 13.417

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.8275 | Avg Accuracy: 53.64 | Model Sparsity: 0.7605
Avg Perplexity: 13.905



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.8018 | Avg Accuracy: 53.92 | Model Sparsity: 0.7605
Avg Perplexity: 13.806

[TRAINER] weights saved!


Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.908.



                                                                                                   

Training epoch [3/5]: Avg Loss: 4.0304 | Avg Accuracy: 40.40 | Model Sparsity: 0.9079
Avg Perplexity: 39.793

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.0541 | Avg Accuracy: 43.33 | Model Sparsity: 0.9079
Avg Perplexity: 31.526

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.7855 | Avg Accuracy: 43.81 | Model Sparsity: 0.9079
Avg Perplexity: 29.001

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.6380 | Avg Accuracy: 45.89 | Model Sparsity: 0.9079
Avg Perplexity: 25.973

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.5355 | Avg Accuracy: 46.04 | Model Sparsity: 0.9079
Avg Perplexity: 24.649

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.4520 | Avg Accuracy: 46.74 | Model Sparsity: 0.9079
Avg Perplexity: 24.153

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.3873 | Avg Accuracy: 47.35 | Model Sparsity: 0.9079
Avg Perplexity: 23.530

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.3271 | Avg Accuracy: 47.26 | Model Sparsity: 0.9079
Avg Perplexity: 23.309



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.2851 | Avg Accuracy: 47.94 | Model Sparsity: 0.9079
Avg Perplexity: 22.767

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.2579 | Avg Accuracy: 48.09 | Model Sparsity: 0.9079
Avg Perplexity: 22.525

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.2068 | Avg Accuracy: 47.98 | Model Sparsity: 0.9079
Avg Perplexity: 22.912



Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.962.



                                                                                                   

Training epoch [4/5]: Avg Loss: 3.7917 | Avg Accuracy: 39.94 | Model Sparsity: 0.9622
Avg Perplexity: 45.669

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.0389 | Avg Accuracy: 41.73 | Model Sparsity: 0.9622
Avg Perplexity: 38.556

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.8591 | Avg Accuracy: 42.22 | Model Sparsity: 0.9622
Avg Perplexity: 35.196

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.7427 | Avg Accuracy: 43.42 | Model Sparsity: 0.9622
Avg Perplexity: 32.899

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.6591 | Avg Accuracy: 44.03 | Model Sparsity: 0.9622
Avg Perplexity: 33.174

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.5889 | Avg Accuracy: 43.90 | Model Sparsity: 0.9622
Avg Perplexity: 33.173



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.5411 | Avg Accuracy: 44.83 | Model Sparsity: 0.9622
Avg Perplexity: 30.692

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.4841 | Avg Accuracy: 44.87 | Model Sparsity: 0.9622
Avg Perplexity: 30.255

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.4499 | Avg Accuracy: 45.20 | Model Sparsity: 0.9622
Avg Perplexity: 30.556

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.4136 | Avg Accuracy: 45.14 | Model Sparsity: 0.9622
Avg Perplexity: 30.977



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.3861 | Avg Accuracy: 45.20 | Model Sparsity: 0.9622
Avg Perplexity: 29.824



Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.970.



                                                                                                 

Training epoch [5/5]: Avg Loss: 2.5524 | Avg Accuracy: 44.34 | Model Sparsity: 0.97
Avg Perplexity: 31.743

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 2.4248 | Avg Accuracy: 44.83 | Model Sparsity: 0.97
Avg Perplexity: 32.290

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 2.3671 | Avg Accuracy: 45.13 | Model Sparsity: 0.97
Avg Perplexity: 31.257

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [3/10]: Avg Loss: 2.3319 | Avg Accuracy: 44.91 | Model Sparsity: 0.97
Avg Perplexity: 31.233



                                                                                                  

Recovery epoch [4/10]: Avg Loss: 2.3007 | Avg Accuracy: 45.14 | Model Sparsity: 0.97
Avg Perplexity: 31.346

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [5/10]: Avg Loss: 2.2741 | Avg Accuracy: 45.32 | Model Sparsity: 0.97
Avg Perplexity: 31.390

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [6/10]: Avg Loss: 2.2504 | Avg Accuracy: 45.03 | Model Sparsity: 0.97
Avg Perplexity: 32.669



                                                                                                  

Recovery epoch [7/10]: Avg Loss: 2.2232 | Avg Accuracy: 45.52 | Model Sparsity: 0.97
Avg Perplexity: 31.533

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [8/10]: Avg Loss: 2.1948 | Avg Accuracy: 45.56 | Model Sparsity: 0.97
Avg Perplexity: 31.921

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [9/10]: Avg Loss: 2.1798 | Avg Accuracy: 45.93 | Model Sparsity: 0.97
Avg Perplexity: 31.103

[TRAINER] weights saved!


                                                                                                   

Recovery epoch [10/10]: Avg Loss: 2.1564 | Avg Accuracy: 45.40 | Model Sparsity: 0.97
Avg Perplexity: 31.313

[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.97_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.97


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     45.43%
  Perplexity:   31.024

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9700 (97.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         magnitude_pruning
  Target Sparsity:      0.97
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [23]:
# Fine-tuned baseline checkpoint that this pruning run starts from.
checkpoint_dir = f"./research/{MODEL_NAME}/{MODEL_TASK}"
finetuned_weights = f"{checkpoint_dir}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Magnitude pruning towards TARGET_SPARSITY_HIGH with a cubic sparsity schedule.
pruning_config = dict(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
training_args = TrainingArguments(**pruning_config)
trainer = Trainer(training_args)

# Train only when the notebook-level TRAIN switch is on; evaluation always runs.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Target sparsity: 0.99
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.99_pruning.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased/wikitext2/pruning/magnitude_pruning/0.99/run_1.log
[TRAINER] Training with mixed precision enabled
[TRAINER] Initial model sparsity: 0.0


Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.483.



                                                                                                   

Training epoch [1/5]: Avg Loss: 2.1525 | Avg Accuracy: 58.95 | Model Sparsity: 0.4831
Avg Perplexity: 8.258

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 1.9669 | Avg Accuracy: 58.62 | Model Sparsity: 0.4831
Avg Perplexity: 8.309



                                                                                                    

Recovery epoch [2/10]: Avg Loss: 1.9106 | Avg Accuracy: 58.92 | Model Sparsity: 0.4831
Avg Perplexity: 8.204



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 1.8703 | Avg Accuracy: 58.39 | Model Sparsity: 0.4831
Avg Perplexity: 8.602



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.8413 | Avg Accuracy: 59.11 | Model Sparsity: 0.4831
Avg Perplexity: 8.500

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.8108 | Avg Accuracy: 58.76 | Model Sparsity: 0.4831
Avg Perplexity: 8.671



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.7808 | Avg Accuracy: 58.06 | Model Sparsity: 0.4831
Avg Perplexity: 8.909



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.7560 | Avg Accuracy: 58.86 | Model Sparsity: 0.4831
Avg Perplexity: 8.611



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.7201 | Avg Accuracy: 58.22 | Model Sparsity: 0.4831
Avg Perplexity: 9.124



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.6932 | Avg Accuracy: 58.76 | Model Sparsity: 0.4831
Avg Perplexity: 8.836



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.6726 | Avg Accuracy: 58.31 | Model Sparsity: 0.4831
Avg Perplexity: 9.205



Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.776.



                                                                                                   

Training epoch [2/5]: Avg Loss: 3.4774 | Avg Accuracy: 47.61 | Model Sparsity: 0.7762
Avg Perplexity: 20.115

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.5215 | Avg Accuracy: 49.84 | Model Sparsity: 0.7762
Avg Perplexity: 17.240

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.3305 | Avg Accuracy: 50.91 | Model Sparsity: 0.7762
Avg Perplexity: 16.149

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.2148 | Avg Accuracy: 51.77 | Model Sparsity: 0.7762
Avg Perplexity: 15.401

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.1262 | Avg Accuracy: 52.31 | Model Sparsity: 0.7762
Avg Perplexity: 14.665

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.0692 | Avg Accuracy: 52.48 | Model Sparsity: 0.7762
Avg Perplexity: 15.170

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.0032 | Avg Accuracy: 52.83 | Model Sparsity: 0.7762
Avg Perplexity: 14.703

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.9643 | Avg Accuracy: 53.09 | Model Sparsity: 0.7762
Avg Perplexity: 14.270

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.9350 | Avg Accuracy: 53.55 | Model Sparsity: 0.7762
Avg Perplexity: 13.750

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.8718 | Avg Accuracy: 53.43 | Model Sparsity: 0.7762
Avg Perplexity: 14.182



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.8551 | Avg Accuracy: 53.18 | Model Sparsity: 0.7762
Avg Perplexity: 14.469



Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.927.



                                                                                                   

Training epoch [3/5]: Avg Loss: 4.6167 | Avg Accuracy: 36.24 | Model Sparsity: 0.9266
Avg Perplexity: 58.390

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.4456 | Avg Accuracy: 39.86 | Model Sparsity: 0.9266
Avg Perplexity: 41.279

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 3.0991 | Avg Accuracy: 41.41 | Model Sparsity: 0.9266
Avg Perplexity: 35.749

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.9006 | Avg Accuracy: 42.55 | Model Sparsity: 0.9266
Avg Perplexity: 33.453

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.7785 | Avg Accuracy: 44.05 | Model Sparsity: 0.9266
Avg Perplexity: 30.366

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.6761 | Avg Accuracy: 44.65 | Model Sparsity: 0.9266
Avg Perplexity: 29.066

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.5884 | Avg Accuracy: 44.91 | Model Sparsity: 0.9266
Avg Perplexity: 28.207

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.5377 | Avg Accuracy: 45.80 | Model Sparsity: 0.9266
Avg Perplexity: 26.770

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.4849 | Avg Accuracy: 45.54 | Model Sparsity: 0.9266
Avg Perplexity: 26.585



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.4357 | Avg Accuracy: 46.45 | Model Sparsity: 0.9266
Avg Perplexity: 26.020

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.3757 | Avg Accuracy: 46.04 | Model Sparsity: 0.9266
Avg Perplexity: 27.047



Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.982.



                                                                                                   

Training epoch [4/5]: Avg Loss: 5.0407 | Avg Accuracy: 32.53 | Model Sparsity: 0.9821
Avg Perplexity: 93.234

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.8775 | Avg Accuracy: 35.50 | Model Sparsity: 0.9821
Avg Perplexity: 69.115

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 3.5158 | Avg Accuracy: 37.32 | Model Sparsity: 0.9821
Avg Perplexity: 58.174

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 3.3285 | Avg Accuracy: 37.79 | Model Sparsity: 0.9821
Avg Perplexity: 54.572

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 3.1844 | Avg Accuracy: 38.75 | Model Sparsity: 0.9821
Avg Perplexity: 50.045

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 3.0793 | Avg Accuracy: 39.80 | Model Sparsity: 0.9821
Avg Perplexity: 46.795

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.9930 | Avg Accuracy: 39.80 | Model Sparsity: 0.9821
Avg Perplexity: 47.323

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.9428 | Avg Accuracy: 40.36 | Model Sparsity: 0.9821
Avg Perplexity: 46.232

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.8748 | Avg Accuracy: 40.46 | Model Sparsity: 0.9821
Avg Perplexity: 45.238

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.8193 | Avg Accuracy: 40.76 | Model Sparsity: 0.9821
Avg Perplexity: 44.006

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.7764 | Avg Accuracy: 40.48 | Model Sparsity: 0.9821
Avg Perplexity: 44.095



Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.990.



                                                                                                 

Training epoch [5/5]: Avg Loss: 3.1068 | Avg Accuracy: 39.72 | Model Sparsity: 0.99
Avg Perplexity: 48.534

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 2.8975 | Avg Accuracy: 40.41 | Model Sparsity: 0.99
Avg Perplexity: 47.052

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 2.8314 | Avg Accuracy: 39.91 | Model Sparsity: 0.99
Avg Perplexity: 47.371



                                                                                                  

Recovery epoch [3/10]: Avg Loss: 2.7852 | Avg Accuracy: 40.77 | Model Sparsity: 0.99
Avg Perplexity: 46.108

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [4/10]: Avg Loss: 2.7397 | Avg Accuracy: 40.73 | Model Sparsity: 0.99
Avg Perplexity: 45.387



                                                                                                  

Recovery epoch [5/10]: Avg Loss: 2.6922 | Avg Accuracy: 41.00 | Model Sparsity: 0.99
Avg Perplexity: 45.519

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [6/10]: Avg Loss: 2.6569 | Avg Accuracy: 41.19 | Model Sparsity: 0.99
Avg Perplexity: 44.431

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [7/10]: Avg Loss: 2.6338 | Avg Accuracy: 41.20 | Model Sparsity: 0.99
Avg Perplexity: 44.666

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [8/10]: Avg Loss: 2.5991 | Avg Accuracy: 41.06 | Model Sparsity: 0.99
Avg Perplexity: 45.093



                                                                                                  

Recovery epoch [9/10]: Avg Loss: 2.5738 | Avg Accuracy: 40.99 | Model Sparsity: 0.99
Avg Perplexity: 45.583



                                                                                                   

Recovery epoch [10/10]: Avg Loss: 2.5415 | Avg Accuracy: 41.39 | Model Sparsity: 0.99
Avg Perplexity: 44.734

[TRAINER] weights saved!
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.99_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.99


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     41.43%
  Perplexity:   45.728

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9900 (99.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         magnitude_pruning
  Target Sparsity:      0.99
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





### SNIP-it Prune

In [24]:
# SNIP pruning run at the low target sparsity, starting from the fine-tuned
# baseline checkpoint of the current model/task.
finetuned_weights = (
    f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
)

# Model/data/optimizer settings first, then the pruning-specific settings.
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=("adamw", 5e-4),
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    pruning_type="snip_pruning",
    sparsity_scheduler="cubic",
    target_sparsity=TARGET_SPARSITY_LOW,
    db=False,
)
trainer = Trainer(training_args=training_args)

# Training is skipped when TRAIN is False; evaluation and the summary always run.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: snip_pruning
[TRAINER] Target sparsity: 0.95
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.95_pruning.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased/wikitext2/pruning/snip_pruning/0.95/run_1.log
[TRAINER] Training with mixed precision enabled
[TRAINER] Initial model sparsity: 0.0


Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.464.



                                                                                                   

Training epoch [1/5]: Avg Loss: 3.1116 | Avg Accuracy: 54.65 | Model Sparsity: 0.4636
Avg Perplexity: 10.971

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.2774 | Avg Accuracy: 55.46 | Model Sparsity: 0.4636
Avg Perplexity: 10.289

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.1701 | Avg Accuracy: 56.25 | Model Sparsity: 0.4636
Avg Perplexity: 9.892

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.1112 | Avg Accuracy: 56.50 | Model Sparsity: 0.4636
Avg Perplexity: 9.923

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.0501 | Avg Accuracy: 56.54 | Model Sparsity: 0.4636
Avg Perplexity: 9.821

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.9977 | Avg Accuracy: 56.30 | Model Sparsity: 0.4636
Avg Perplexity: 10.059



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.9661 | Avg Accuracy: 56.21 | Model Sparsity: 0.4636
Avg Perplexity: 10.224



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.9251 | Avg Accuracy: 55.88 | Model Sparsity: 0.4636
Avg Perplexity: 10.372



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.8914 | Avg Accuracy: 56.33 | Model Sparsity: 0.4636
Avg Perplexity: 10.406



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.8657 | Avg Accuracy: 56.31 | Model Sparsity: 0.4636
Avg Perplexity: 10.505



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.8299 | Avg Accuracy: 56.19 | Model Sparsity: 0.4636
Avg Perplexity: 10.779



Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.745.



                                                                                                   

Training epoch [2/5]: Avg Loss: 3.0627 | Avg Accuracy: 48.23 | Model Sparsity: 0.7448
Avg Perplexity: 19.693

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.4806 | Avg Accuracy: 49.81 | Model Sparsity: 0.7448
Avg Perplexity: 17.391

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.3360 | Avg Accuracy: 51.06 | Model Sparsity: 0.7448
Avg Perplexity: 16.394

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.2474 | Avg Accuracy: 50.97 | Model Sparsity: 0.7448
Avg Perplexity: 16.281



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.1801 | Avg Accuracy: 51.48 | Model Sparsity: 0.7448
Avg Perplexity: 15.822

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.1093 | Avg Accuracy: 51.20 | Model Sparsity: 0.7448
Avg Perplexity: 15.756



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.0710 | Avg Accuracy: 51.73 | Model Sparsity: 0.7448
Avg Perplexity: 15.658

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.0328 | Avg Accuracy: 51.99 | Model Sparsity: 0.7448
Avg Perplexity: 15.587

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.9941 | Avg Accuracy: 52.51 | Model Sparsity: 0.7448
Avg Perplexity: 15.074

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.9492 | Avg Accuracy: 52.27 | Model Sparsity: 0.7448
Avg Perplexity: 15.494



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.9174 | Avg Accuracy: 52.21 | Model Sparsity: 0.7448
Avg Perplexity: 15.165



Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.889.



                                                                                                   

Training epoch [3/5]: Avg Loss: 3.7220 | Avg Accuracy: 41.49 | Model Sparsity: 0.8892
Avg Perplexity: 37.007

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.8125 | Avg Accuracy: 44.42 | Model Sparsity: 0.8892
Avg Perplexity: 29.176

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.6087 | Avg Accuracy: 45.41 | Model Sparsity: 0.8892
Avg Perplexity: 26.892

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.4882 | Avg Accuracy: 45.90 | Model Sparsity: 0.8892
Avg Perplexity: 25.831

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.4106 | Avg Accuracy: 46.41 | Model Sparsity: 0.8892
Avg Perplexity: 24.846

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.3526 | Avg Accuracy: 47.01 | Model Sparsity: 0.8892
Avg Perplexity: 24.747

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.2927 | Avg Accuracy: 47.62 | Model Sparsity: 0.8892
Avg Perplexity: 23.444

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.2359 | Avg Accuracy: 47.08 | Model Sparsity: 0.8892
Avg Perplexity: 24.339



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.2005 | Avg Accuracy: 48.11 | Model Sparsity: 0.8892
Avg Perplexity: 23.050

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.1734 | Avg Accuracy: 48.40 | Model Sparsity: 0.8892
Avg Perplexity: 23.351

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.1394 | Avg Accuracy: 48.10 | Model Sparsity: 0.8892
Avg Perplexity: 22.738



Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.942.



                                                                                                   

Training epoch [4/5]: Avg Loss: 3.2750 | Avg Accuracy: 41.69 | Model Sparsity: 0.9424
Avg Perplexity: 38.337

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.7222 | Avg Accuracy: 42.46 | Model Sparsity: 0.9424
Avg Perplexity: 35.703

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.5916 | Avg Accuracy: 43.78 | Model Sparsity: 0.9424
Avg Perplexity: 32.160

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.5037 | Avg Accuracy: 44.06 | Model Sparsity: 0.9424
Avg Perplexity: 31.972

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.4370 | Avg Accuracy: 44.35 | Model Sparsity: 0.9424
Avg Perplexity: 31.915

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.3872 | Avg Accuracy: 44.39 | Model Sparsity: 0.9424
Avg Perplexity: 32.352

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.3430 | Avg Accuracy: 44.61 | Model Sparsity: 0.9424
Avg Perplexity: 31.698

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.2926 | Avg Accuracy: 44.64 | Model Sparsity: 0.9424
Avg Perplexity: 31.559

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.2712 | Avg Accuracy: 45.45 | Model Sparsity: 0.9424
Avg Perplexity: 30.926

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.2370 | Avg Accuracy: 45.54 | Model Sparsity: 0.9424
Avg Perplexity: 30.239

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.2080 | Avg Accuracy: 45.72 | Model Sparsity: 0.9424
Avg Perplexity: 30.381

[TRAINER] weights saved!


Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.950.



                                                                                                 

Training epoch [5/5]: Avg Loss: 2.3540 | Avg Accuracy: 44.61 | Model Sparsity: 0.95
Avg Perplexity: 33.015

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 2.2769 | Avg Accuracy: 44.95 | Model Sparsity: 0.95
Avg Perplexity: 33.025

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 2.2327 | Avg Accuracy: 45.13 | Model Sparsity: 0.95
Avg Perplexity: 32.599

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [3/10]: Avg Loss: 2.2072 | Avg Accuracy: 45.00 | Model Sparsity: 0.95
Avg Perplexity: 32.656



                                                                                                  

Recovery epoch [4/10]: Avg Loss: 2.1747 | Avg Accuracy: 44.46 | Model Sparsity: 0.95
Avg Perplexity: 33.569



                                                                                                  

Recovery epoch [5/10]: Avg Loss: 2.1447 | Avg Accuracy: 44.81 | Model Sparsity: 0.95
Avg Perplexity: 32.656



                                                                                                  

Recovery epoch [6/10]: Avg Loss: 2.1146 | Avg Accuracy: 45.40 | Model Sparsity: 0.95
Avg Perplexity: 31.938

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [7/10]: Avg Loss: 2.0946 | Avg Accuracy: 45.23 | Model Sparsity: 0.95
Avg Perplexity: 34.098



                                                                                                  

Recovery epoch [8/10]: Avg Loss: 2.0720 | Avg Accuracy: 45.48 | Model Sparsity: 0.95
Avg Perplexity: 32.527

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [9/10]: Avg Loss: 2.0649 | Avg Accuracy: 45.18 | Model Sparsity: 0.95
Avg Perplexity: 34.410



                                                                                                   

Recovery epoch [10/10]: Avg Loss: 2.0418 | Avg Accuracy: 45.55 | Model Sparsity: 0.95
Avg Perplexity: 33.867

[TRAINER] weights saved!
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.95_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.95


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     45.71%
  Perplexity:   33.934

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9500 (95.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         snip_pruning
  Target Sparsity:      0.95
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [25]:
# SNIP pruning run at the mid target sparsity, starting from the fine-tuned
# baseline checkpoint of the current model/task.
finetuned_weights = (
    f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
)

# Model/data/optimizer settings first, then the pruning-specific settings.
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=("adamw", 5e-4),
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    pruning_type="snip_pruning",
    sparsity_scheduler="cubic",
    target_sparsity=TARGET_SPARSITY_MID,
    db=False,
)
trainer = Trainer(training_args=training_args)

# Training is skipped when TRAIN is False; evaluation and the summary always run.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: snip_pruning
[TRAINER] Target sparsity: 0.97
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.97_pruning.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased/wikitext2/pruning/snip_pruning/0.97/run_1.log
[TRAINER] Training with mixed precision enabled
[TRAINER] Initial model sparsity: 0.0


Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.473.



                                                                                                   

Training epoch [1/5]: Avg Loss: 3.2142 | Avg Accuracy: 54.11 | Model Sparsity: 0.4734
Avg Perplexity: 11.187

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.2967 | Avg Accuracy: 54.88 | Model Sparsity: 0.4734
Avg Perplexity: 10.666

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.1933 | Avg Accuracy: 56.14 | Model Sparsity: 0.4734
Avg Perplexity: 10.027

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.1233 | Avg Accuracy: 55.71 | Model Sparsity: 0.4734
Avg Perplexity: 10.307



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.0610 | Avg Accuracy: 56.01 | Model Sparsity: 0.4734
Avg Perplexity: 10.072



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.0092 | Avg Accuracy: 55.45 | Model Sparsity: 0.4734
Avg Perplexity: 10.533



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.9787 | Avg Accuracy: 56.47 | Model Sparsity: 0.4734
Avg Perplexity: 10.102

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.9303 | Avg Accuracy: 56.47 | Model Sparsity: 0.4734
Avg Perplexity: 10.149



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.8951 | Avg Accuracy: 56.20 | Model Sparsity: 0.4734
Avg Perplexity: 10.550



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.8677 | Avg Accuracy: 56.25 | Model Sparsity: 0.4734
Avg Perplexity: 10.454



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.8331 | Avg Accuracy: 56.28 | Model Sparsity: 0.4734
Avg Perplexity: 10.547



Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.760.



                                                                                                   

Training epoch [2/5]: Avg Loss: 3.1628 | Avg Accuracy: 47.62 | Model Sparsity: 0.7605
Avg Perplexity: 20.471

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.5422 | Avg Accuracy: 49.54 | Model Sparsity: 0.7605
Avg Perplexity: 17.575

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.3837 | Avg Accuracy: 50.50 | Model Sparsity: 0.7605
Avg Perplexity: 16.732

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.2794 | Avg Accuracy: 51.43 | Model Sparsity: 0.7605
Avg Perplexity: 15.898

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.2048 | Avg Accuracy: 51.33 | Model Sparsity: 0.7605
Avg Perplexity: 15.992



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.1541 | Avg Accuracy: 51.49 | Model Sparsity: 0.7605
Avg Perplexity: 15.719

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.1014 | Avg Accuracy: 51.74 | Model Sparsity: 0.7605
Avg Perplexity: 15.620

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.0582 | Avg Accuracy: 51.37 | Model Sparsity: 0.7605
Avg Perplexity: 16.204



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.0203 | Avg Accuracy: 51.76 | Model Sparsity: 0.7605
Avg Perplexity: 15.841

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.9864 | Avg Accuracy: 51.66 | Model Sparsity: 0.7605
Avg Perplexity: 15.900



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.9567 | Avg Accuracy: 52.22 | Model Sparsity: 0.7605
Avg Perplexity: 15.506

[TRAINER] weights saved!


Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.908.



                                                                                                   

Training epoch [3/5]: Avg Loss: 4.2150 | Avg Accuracy: 38.92 | Model Sparsity: 0.9079
Avg Perplexity: 45.921

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.0354 | Avg Accuracy: 41.97 | Model Sparsity: 0.9079
Avg Perplexity: 35.414

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.7930 | Avg Accuracy: 43.45 | Model Sparsity: 0.9079
Avg Perplexity: 30.859

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.6553 | Avg Accuracy: 44.76 | Model Sparsity: 0.9079
Avg Perplexity: 29.426

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.5438 | Avg Accuracy: 44.83 | Model Sparsity: 0.9079
Avg Perplexity: 28.050

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.4742 | Avg Accuracy: 45.36 | Model Sparsity: 0.9079
Avg Perplexity: 27.816

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.4077 | Avg Accuracy: 46.22 | Model Sparsity: 0.9079
Avg Perplexity: 26.490

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.3511 | Avg Accuracy: 46.15 | Model Sparsity: 0.9079
Avg Perplexity: 26.421



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.3055 | Avg Accuracy: 46.71 | Model Sparsity: 0.9079
Avg Perplexity: 25.564

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.2745 | Avg Accuracy: 47.22 | Model Sparsity: 0.9079
Avg Perplexity: 25.406

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.2285 | Avg Accuracy: 47.46 | Model Sparsity: 0.9079
Avg Perplexity: 24.665

[TRAINER] weights saved!


Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.962.



                                                                                                   

Training epoch [4/5]: Avg Loss: 4.0106 | Avg Accuracy: 37.43 | Model Sparsity: 0.9622
Avg Perplexity: 57.295

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.0915 | Avg Accuracy: 39.91 | Model Sparsity: 0.9622
Avg Perplexity: 46.122

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.8824 | Avg Accuracy: 40.74 | Model Sparsity: 0.9622
Avg Perplexity: 44.877

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.7697 | Avg Accuracy: 41.57 | Model Sparsity: 0.9622
Avg Perplexity: 39.900

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.6926 | Avg Accuracy: 40.91 | Model Sparsity: 0.9622
Avg Perplexity: 41.479



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.6192 | Avg Accuracy: 41.82 | Model Sparsity: 0.9622
Avg Perplexity: 39.924

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.5635 | Avg Accuracy: 41.95 | Model Sparsity: 0.9622
Avg Perplexity: 38.989

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.5161 | Avg Accuracy: 42.21 | Model Sparsity: 0.9622
Avg Perplexity: 38.730

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.4823 | Avg Accuracy: 43.06 | Model Sparsity: 0.9622
Avg Perplexity: 37.904

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.4339 | Avg Accuracy: 43.09 | Model Sparsity: 0.9622
Avg Perplexity: 37.615

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.4140 | Avg Accuracy: 43.09 | Model Sparsity: 0.9622
Avg Perplexity: 37.370

[TRAINER] weights saved!


Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.970.



                                                                                                 

Training epoch [5/5]: Avg Loss: 2.6274 | Avg Accuracy: 41.89 | Model Sparsity: 0.97
Avg Perplexity: 41.524

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 2.5146 | Avg Accuracy: 42.14 | Model Sparsity: 0.97
Avg Perplexity: 40.748

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 2.4584 | Avg Accuracy: 41.94 | Model Sparsity: 0.97
Avg Perplexity: 41.019



                                                                                                  

Recovery epoch [3/10]: Avg Loss: 2.4268 | Avg Accuracy: 43.23 | Model Sparsity: 0.97
Avg Perplexity: 39.077

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [4/10]: Avg Loss: 2.3801 | Avg Accuracy: 42.91 | Model Sparsity: 0.97
Avg Perplexity: 38.380



                                                                                                  

Recovery epoch [5/10]: Avg Loss: 2.3508 | Avg Accuracy: 42.99 | Model Sparsity: 0.97
Avg Perplexity: 39.374



                                                                                                  

Recovery epoch [6/10]: Avg Loss: 2.3325 | Avg Accuracy: 42.53 | Model Sparsity: 0.97
Avg Perplexity: 40.721



                                                                                                  

Recovery epoch [7/10]: Avg Loss: 2.3090 | Avg Accuracy: 42.98 | Model Sparsity: 0.97
Avg Perplexity: 40.313



                                                                                                  

Recovery epoch [8/10]: Avg Loss: 2.2772 | Avg Accuracy: 42.78 | Model Sparsity: 0.97
Avg Perplexity: 41.284



                                                                                                  

Recovery epoch [9/10]: Avg Loss: 2.2473 | Avg Accuracy: 43.18 | Model Sparsity: 0.97
Avg Perplexity: 40.265



                                                                                                   

Recovery epoch [10/10]: Avg Loss: 2.2419 | Avg Accuracy: 43.02 | Model Sparsity: 0.97
Avg Perplexity: 40.895

[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.97_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.97


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     42.28%
  Perplexity:   40.321

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9700 (97.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         snip_pruning
  Target Sparsity:      0.97
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [8]:
# Path of the baseline checkpoint; it is handed to the trainer as the starting weights for pruning.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# SNIP pruning toward TARGET_SPARSITY_HIGH, with the sparsity level driven by the cubic scheduler.
training_args = TrainingArguments(
    # Model / task / starting point
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    learning_type="pruning",
    finetuned_weights=finetuned_weights,
    # Optimisation
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=("adamw", 5e-4),
    # Pruning schedule
    pruning_type="snip_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler="cubic",
    db=False,
)
trainer = Trainer(training_args=training_args)

# TRAIN gates the (long) pruning run; evaluation and the summary always execute.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: snip_pruning
[TRAINER] Target sparsity: 0.99
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.99_pruning.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased/wikitext2/pruning/snip_pruning/0.99/run_1.log
[TRAINER] Training with mixed precision enabled
[TRAINER] Initial model sparsity: 0.0


Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.483.



                                                                                                   

Training epoch [1/5]: Avg Loss: 3.2108 | Avg Accuracy: 54.77 | Model Sparsity: 0.4831
Avg Perplexity: 10.788

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.2834 | Avg Accuracy: 56.11 | Model Sparsity: 0.4831
Avg Perplexity: 9.986

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.1737 | Avg Accuracy: 56.06 | Model Sparsity: 0.4831
Avg Perplexity: 10.251



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.1020 | Avg Accuracy: 55.77 | Model Sparsity: 0.4831
Avg Perplexity: 10.433



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.0514 | Avg Accuracy: 56.28 | Model Sparsity: 0.4831
Avg Perplexity: 10.219

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.9972 | Avg Accuracy: 55.90 | Model Sparsity: 0.4831
Avg Perplexity: 10.217



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.9568 | Avg Accuracy: 56.25 | Model Sparsity: 0.4831
Avg Perplexity: 10.352



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.9170 | Avg Accuracy: 56.36 | Model Sparsity: 0.4831
Avg Perplexity: 10.368

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.8773 | Avg Accuracy: 56.40 | Model Sparsity: 0.4831
Avg Perplexity: 10.433

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.8497 | Avg Accuracy: 56.16 | Model Sparsity: 0.4831
Avg Perplexity: 10.558



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.8214 | Avg Accuracy: 56.07 | Model Sparsity: 0.4831
Avg Perplexity: 10.730



Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.776.



                                                                                                   

Training epoch [2/5]: Avg Loss: 3.3308 | Avg Accuracy: 45.97 | Model Sparsity: 0.7762
Avg Perplexity: 23.434

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.6132 | Avg Accuracy: 48.47 | Model Sparsity: 0.7762
Avg Perplexity: 19.344

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.4428 | Avg Accuracy: 49.83 | Model Sparsity: 0.7762
Avg Perplexity: 17.905

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.3286 | Avg Accuracy: 50.48 | Model Sparsity: 0.7762
Avg Perplexity: 17.133

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.2603 | Avg Accuracy: 50.68 | Model Sparsity: 0.7762
Avg Perplexity: 16.858

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.2015 | Avg Accuracy: 50.92 | Model Sparsity: 0.7762
Avg Perplexity: 16.392

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.1372 | Avg Accuracy: 50.56 | Model Sparsity: 0.7762
Avg Perplexity: 16.774



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.0954 | Avg Accuracy: 51.41 | Model Sparsity: 0.7762
Avg Perplexity: 16.421

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.0580 | Avg Accuracy: 51.06 | Model Sparsity: 0.7762
Avg Perplexity: 16.819



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.0112 | Avg Accuracy: 51.58 | Model Sparsity: 0.7762
Avg Perplexity: 16.050

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.9768 | Avg Accuracy: 52.11 | Model Sparsity: 0.7762
Avg Perplexity: 16.224

[TRAINER] weights saved!


Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.927.



                                                                                                   

Training epoch [3/5]: Avg Loss: 5.2125 | Avg Accuracy: 34.50 | Model Sparsity: 0.9266
Avg Perplexity: 69.326

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.4348 | Avg Accuracy: 38.42 | Model Sparsity: 0.9266
Avg Perplexity: 47.387

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 3.0782 | Avg Accuracy: 40.40 | Model Sparsity: 0.9266
Avg Perplexity: 40.474

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.8964 | Avg Accuracy: 41.45 | Model Sparsity: 0.9266
Avg Perplexity: 37.189

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.7604 | Avg Accuracy: 42.39 | Model Sparsity: 0.9266
Avg Perplexity: 34.987

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.6643 | Avg Accuracy: 43.07 | Model Sparsity: 0.9266
Avg Perplexity: 33.629

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.6005 | Avg Accuracy: 43.90 | Model Sparsity: 0.9266
Avg Perplexity: 30.263

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.5243 | Avg Accuracy: 44.03 | Model Sparsity: 0.9266
Avg Perplexity: 31.949

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.4541 | Avg Accuracy: 44.64 | Model Sparsity: 0.9266
Avg Perplexity: 30.533

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.4144 | Avg Accuracy: 45.03 | Model Sparsity: 0.9266
Avg Perplexity: 28.375

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.3794 | Avg Accuracy: 45.11 | Model Sparsity: 0.9266
Avg Perplexity: 29.263

[TRAINER] weights saved!


Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.982.



                                                                                                   

Training epoch [4/5]: Avg Loss: 6.1548 | Avg Accuracy: 21.47 | Model Sparsity: 0.9821
Avg Perplexity: 304.875

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 4.6266 | Avg Accuracy: 31.91 | Model Sparsity: 0.9821
Avg Perplexity: 94.822

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 3.7633 | Avg Accuracy: 34.93 | Model Sparsity: 0.9821
Avg Perplexity: 73.106

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 3.4953 | Avg Accuracy: 35.66 | Model Sparsity: 0.9821
Avg Perplexity: 67.201

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 3.3316 | Avg Accuracy: 36.45 | Model Sparsity: 0.9821
Avg Perplexity: 64.822

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 3.2151 | Avg Accuracy: 36.89 | Model Sparsity: 0.9821
Avg Perplexity: 61.915

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 3.1245 | Avg Accuracy: 37.05 | Model Sparsity: 0.9821
Avg Perplexity: 60.597

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 3.0443 | Avg Accuracy: 37.39 | Model Sparsity: 0.9821
Avg Perplexity: 59.262

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.9896 | Avg Accuracy: 37.62 | Model Sparsity: 0.9821
Avg Perplexity: 58.464

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.9229 | Avg Accuracy: 38.07 | Model Sparsity: 0.9821
Avg Perplexity: 57.457

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.8732 | Avg Accuracy: 38.23 | Model Sparsity: 0.9821
Avg Perplexity: 57.370

[TRAINER] weights saved!


Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.990.



                                                                                                 

Training epoch [5/5]: Avg Loss: 3.3654 | Avg Accuracy: 35.89 | Model Sparsity: 0.99
Avg Perplexity: 69.241

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 3.1115 | Avg Accuracy: 36.90 | Model Sparsity: 0.99
Avg Perplexity: 63.466

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 3.0238 | Avg Accuracy: 37.70 | Model Sparsity: 0.99
Avg Perplexity: 62.984

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [3/10]: Avg Loss: 2.9634 | Avg Accuracy: 37.43 | Model Sparsity: 0.99
Avg Perplexity: 60.943



                                                                                                  

Recovery epoch [4/10]: Avg Loss: 2.9161 | Avg Accuracy: 37.46 | Model Sparsity: 0.99
Avg Perplexity: 64.173



                                                                                                  

Recovery epoch [5/10]: Avg Loss: 2.8649 | Avg Accuracy: 37.63 | Model Sparsity: 0.99
Avg Perplexity: 63.814



                                                                                                  

Recovery epoch [6/10]: Avg Loss: 2.8373 | Avg Accuracy: 37.45 | Model Sparsity: 0.99
Avg Perplexity: 63.053



                                                                                                  

Recovery epoch [7/10]: Avg Loss: 2.8021 | Avg Accuracy: 38.41 | Model Sparsity: 0.99
Avg Perplexity: 61.215

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [8/10]: Avg Loss: 2.7686 | Avg Accuracy: 37.81 | Model Sparsity: 0.99
Avg Perplexity: 62.193



                                                                                                  

Recovery epoch [9/10]: Avg Loss: 2.7368 | Avg Accuracy: 38.30 | Model Sparsity: 0.99
Avg Perplexity: 61.324



                                                                                                   

Recovery epoch [10/10]: Avg Loss: 2.7098 | Avg Accuracy: 38.54 | Model Sparsity: 0.99
Avg Perplexity: 60.417

[TRAINER] weights saved!
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.99_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.99


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     37.80%
  Perplexity:   65.203

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9900 (99.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         snip_pruning
  Target Sparsity:      0.99
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





### WandA Prune

In [4]:
# Path of the baseline checkpoint; it is handed to the trainer as the starting weights for pruning.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Wanda pruning toward TARGET_SPARSITY_LOW, with the sparsity level driven by the cubic scheduler.
training_args = TrainingArguments(
    # Model / task / starting point
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    learning_type="pruning",
    finetuned_weights=finetuned_weights,
    # Optimisation
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=("adamw", 5e-4),
    # Pruning schedule
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler="cubic",
    db=False,
)
trainer = Trainer(training_args=training_args)

# TRAIN gates the (long) pruning run; evaluation and the summary always execute.
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
.weight
model.weight
model.activation.weight
model.distilbert.weight
model.distilbert.embeddings.weight
model.distilbert.embeddings.word_embeddings.weight
model.distilbert.embeddings.position_embeddings.weight
model.distilbert.embeddings.LayerNorm.weight
model.distilbert.embeddings.dropout.weight
model.distilbert.transformer.weight
model.distilbert.transformer.layer.weight
model.distilbert.transformer.layer.0.weight
model.distilbert.transformer.layer.0.attention.weight
model.distilbert.transformer.layer.0.attention.dropout.weight
model.distilbert.transformer.layer.0.attention.q_lin.weight

Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.464.



Training Epoch [1/5]:   4%|▍         | 3/71 [00:01<00:25,  2.69it/s, Loss=2.8433, Sparsity=0.4636]


[Pruner] Removing hooks


                                                                                                   

Training epoch [1/5]: Avg Loss: 2.3055 | Avg Accuracy: 57.02 | Model Sparsity: 0.4636
Avg Perplexity: 9.480

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.1244 | Avg Accuracy: 56.51 | Model Sparsity: 0.4636
Avg Perplexity: 9.669



                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.0593 | Avg Accuracy: 57.09 | Model Sparsity: 0.4636
Avg Perplexity: 9.501

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.0087 | Avg Accuracy: 57.29 | Model Sparsity: 0.4636
Avg Perplexity: 9.374

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.9701 | Avg Accuracy: 56.79 | Model Sparsity: 0.4636
Avg Perplexity: 9.958



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.9286 | Avg Accuracy: 57.52 | Model Sparsity: 0.4636
Avg Perplexity: 9.473

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.8968 | Avg Accuracy: 57.20 | Model Sparsity: 0.4636
Avg Perplexity: 9.832



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.8575 | Avg Accuracy: 57.37 | Model Sparsity: 0.4636
Avg Perplexity: 9.845



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.8153 | Avg Accuracy: 57.19 | Model Sparsity: 0.4636
Avg Perplexity: 9.845



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.7880 | Avg Accuracy: 56.91 | Model Sparsity: 0.4636
Avg Perplexity: 10.149



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.7586 | Avg Accuracy: 57.48 | Model Sparsity: 0.4636
Avg Perplexity: 9.985

[Pruner] Adding hooks


Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.745.



Training Epoch [2/5]:   4%|▍         | 3/71 [00:00<00:12,  5.33it/s, Loss=4.7526, Sparsity=0.7448]


[Pruner] Removing hooks


                                                                                                   

Training epoch [2/5]: Avg Loss: 3.7233 | Avg Accuracy: 43.52 | Model Sparsity: 0.7448
Avg Perplexity: 31.678

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.9985 | Avg Accuracy: 46.35 | Model Sparsity: 0.7448
Avg Perplexity: 25.396

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.7924 | Avg Accuracy: 47.14 | Model Sparsity: 0.7448
Avg Perplexity: 23.211

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.6535 | Avg Accuracy: 47.78 | Model Sparsity: 0.7448
Avg Perplexity: 21.904

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.5692 | Avg Accuracy: 48.26 | Model Sparsity: 0.7448
Avg Perplexity: 21.705

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.4851 | Avg Accuracy: 48.54 | Model Sparsity: 0.7448
Avg Perplexity: 20.797

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.4254 | Avg Accuracy: 48.62 | Model Sparsity: 0.7448
Avg Perplexity: 21.076

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.3693 | Avg Accuracy: 49.55 | Model Sparsity: 0.7448
Avg Perplexity: 20.002

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.3026 | Avg Accuracy: 49.42 | Model Sparsity: 0.7448
Avg Perplexity: 20.238



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.2680 | Avg Accuracy: 49.66 | Model Sparsity: 0.7448
Avg Perplexity: 20.080

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.2349 | Avg Accuracy: 49.61 | Model Sparsity: 0.7448
Avg Perplexity: 20.262

[Pruner] Adding hooks


Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.889.



Training Epoch [3/5]:   4%|▍         | 3/71 [00:00<00:12,  5.43it/s, Loss=4.6300, Sparsity=0.8892]


[Pruner] Removing hooks


                                                                                                   

Training epoch [3/5]: Avg Loss: 3.7366 | Avg Accuracy: 40.42 | Model Sparsity: 0.8892
Avg Perplexity: 41.560

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.0298 | Avg Accuracy: 42.38 | Model Sparsity: 0.8892
Avg Perplexity: 35.517

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.8250 | Avg Accuracy: 43.34 | Model Sparsity: 0.8892
Avg Perplexity: 32.784

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.7136 | Avg Accuracy: 44.61 | Model Sparsity: 0.8892
Avg Perplexity: 30.047

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.6274 | Avg Accuracy: 44.90 | Model Sparsity: 0.8892
Avg Perplexity: 30.259

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.5638 | Avg Accuracy: 45.62 | Model Sparsity: 0.8892
Avg Perplexity: 29.414

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.5124 | Avg Accuracy: 46.03 | Model Sparsity: 0.8892
Avg Perplexity: 28.328

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.4560 | Avg Accuracy: 45.69 | Model Sparsity: 0.8892
Avg Perplexity: 28.479



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.4235 | Avg Accuracy: 46.19 | Model Sparsity: 0.8892
Avg Perplexity: 27.880

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.3785 | Avg Accuracy: 45.84 | Model Sparsity: 0.8892
Avg Perplexity: 28.266



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.3435 | Avg Accuracy: 46.37 | Model Sparsity: 0.8892
Avg Perplexity: 27.362

[TRAINER] weights saved!
[Pruner] Adding hooks


Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.942.



Training Epoch [4/5]:   4%|▍         | 3/71 [00:00<00:12,  5.34it/s, Loss=3.2010, Sparsity=0.9424]


[Pruner] Removing hooks


                                                                                                   

Training epoch [4/5]: Avg Loss: 2.8605 | Avg Accuracy: 42.79 | Model Sparsity: 0.9424
Avg Perplexity: 36.851

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.6078 | Avg Accuracy: 43.74 | Model Sparsity: 0.9424
Avg Perplexity: 33.354

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.5264 | Avg Accuracy: 44.04 | Model Sparsity: 0.9424
Avg Perplexity: 34.507

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.4747 | Avg Accuracy: 44.26 | Model Sparsity: 0.9424
Avg Perplexity: 33.282

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.4121 | Avg Accuracy: 44.61 | Model Sparsity: 0.9424
Avg Perplexity: 32.778

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.3824 | Avg Accuracy: 44.66 | Model Sparsity: 0.9424
Avg Perplexity: 32.921

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.3405 | Avg Accuracy: 45.23 | Model Sparsity: 0.9424
Avg Perplexity: 31.420

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.3152 | Avg Accuracy: 44.93 | Model Sparsity: 0.9424
Avg Perplexity: 32.400



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.2874 | Avg Accuracy: 44.95 | Model Sparsity: 0.9424
Avg Perplexity: 32.147



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.2647 | Avg Accuracy: 45.42 | Model Sparsity: 0.9424
Avg Perplexity: 31.354

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.2414 | Avg Accuracy: 45.10 | Model Sparsity: 0.9424
Avg Perplexity: 33.155

[Pruner] Adding hooks


Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.950.



Training Epoch [5/5]:   4%|▍         | 3/71 [00:00<00:12,  5.34it/s, Loss=2.3899, Sparsity=0.95]


[Pruner] Removing hooks


                                                                                                 

Training epoch [5/5]: Avg Loss: 2.3125 | Avg Accuracy: 44.87 | Model Sparsity: 0.95
Avg Perplexity: 33.100

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 2.2435 | Avg Accuracy: 44.92 | Model Sparsity: 0.95
Avg Perplexity: 34.513

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 2.2136 | Avg Accuracy: 45.59 | Model Sparsity: 0.95
Avg Perplexity: 32.076

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [3/10]: Avg Loss: 2.1887 | Avg Accuracy: 45.03 | Model Sparsity: 0.95
Avg Perplexity: 33.859



                                                                                                  

Recovery epoch [4/10]: Avg Loss: 2.1627 | Avg Accuracy: 45.49 | Model Sparsity: 0.95
Avg Perplexity: 32.194



                                                                                                  

Recovery epoch [5/10]: Avg Loss: 2.1370 | Avg Accuracy: 44.87 | Model Sparsity: 0.95
Avg Perplexity: 34.190



                                                                                                  

Recovery epoch [6/10]: Avg Loss: 2.1227 | Avg Accuracy: 45.60 | Model Sparsity: 0.95
Avg Perplexity: 32.660

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [7/10]: Avg Loss: 2.1098 | Avg Accuracy: 45.61 | Model Sparsity: 0.95
Avg Perplexity: 32.791

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [8/10]: Avg Loss: 2.0766 | Avg Accuracy: 45.36 | Model Sparsity: 0.95
Avg Perplexity: 32.990



                                                                                                  

Recovery epoch [9/10]: Avg Loss: 2.0743 | Avg Accuracy: 45.79 | Model Sparsity: 0.95
Avg Perplexity: 33.681

[TRAINER] weights saved!


                                                                                                   

Recovery epoch [10/10]: Avg Loss: 2.0494 | Avg Accuracy: 45.56 | Model Sparsity: 0.95
Avg Perplexity: 33.996

[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_wanda_pruning_0.95_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.95


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     45.65%
  Perplexity:   33.869

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9500 (95.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         wanda_pruning
  Target Sparsity:      0.95
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [6]:
# Baseline checkpoint for this model/task; handed to the trainer as its starting weights.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Wanda pruning run towards TARGET_SPARSITY_MID on a cubic sparsity schedule.
training_args = TrainingArguments(
    # Model, task and starting checkpoint
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    finetuned_weights=finetuned_weights,
    # Optimisation settings
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    # Pruning settings
    learning_type="pruning",
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    # NOTE(review): the meaning of `db` is not visible from this notebook; kept as-is.
    db=False,
)
trainer = Trainer(training_args)

# TRAIN is the notebook-level switch defined next to MODEL_NAME / MODEL_TASK;
# when it is False the pruning run is skipped and the trainer is only evaluated.
if TRAIN:
    trainer.train()

# Evaluate and print the statistics summary for this configuration.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
.weight
model.weight
model.activation.weight
model.distilbert.weight
model.distilbert.embeddings.weight
model.distilbert.embeddings.word_embeddings.weight
model.distilbert.embeddings.position_embeddings.weight
model.distilbert.embeddings.LayerNorm.weight
model.distilbert.embeddings.dropout.weight
model.distilbert.transformer.weight
model.distilbert.transformer.layer.weight
model.distilbert.transformer.layer.0.weight
model.distilbert.transformer.layer.0.attention.weight
model.distilbert.transformer.layer.0.attention.dropout.weight
model.distilbert.transformer.layer.0.attention.q_lin.weight

Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.473.



Training Epoch [1/5]:   4%|▍         | 3/71 [00:01<00:19,  3.58it/s, Loss=2.4791, Sparsity=0.4734]


[Pruner] Removing hooks


                                                                                                   

Training epoch [1/5]: Avg Loss: 2.2992 | Avg Accuracy: 56.95 | Model Sparsity: 0.4734
Avg Perplexity: 9.615

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.1490 | Avg Accuracy: 57.32 | Model Sparsity: 0.4734
Avg Perplexity: 9.311

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.0823 | Avg Accuracy: 57.12 | Model Sparsity: 0.4734
Avg Perplexity: 9.537



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.0250 | Avg Accuracy: 57.15 | Model Sparsity: 0.4734
Avg Perplexity: 9.701



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 1.9879 | Avg Accuracy: 56.31 | Model Sparsity: 0.4734
Avg Perplexity: 10.124



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.9471 | Avg Accuracy: 56.81 | Model Sparsity: 0.4734
Avg Perplexity: 10.036



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.9063 | Avg Accuracy: 56.41 | Model Sparsity: 0.4734
Avg Perplexity: 10.368



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.8764 | Avg Accuracy: 56.89 | Model Sparsity: 0.4734
Avg Perplexity: 10.362



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.8332 | Avg Accuracy: 57.02 | Model Sparsity: 0.4734
Avg Perplexity: 10.074



                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.7964 | Avg Accuracy: 56.80 | Model Sparsity: 0.4734
Avg Perplexity: 10.294



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.7790 | Avg Accuracy: 57.08 | Model Sparsity: 0.4734
Avg Perplexity: 10.150

[Pruner] Adding hooks


Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.760.



Training Epoch [2/5]:   4%|▍         | 3/71 [00:00<00:12,  5.31it/s, Loss=5.1436, Sparsity=0.7605]


[Pruner] Removing hooks


                                                                                                   

Training epoch [2/5]: Avg Loss: 3.9235 | Avg Accuracy: 42.49 | Model Sparsity: 0.7605
Avg Perplexity: 34.922

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.1057 | Avg Accuracy: 44.98 | Model Sparsity: 0.7605
Avg Perplexity: 28.501

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.8611 | Avg Accuracy: 46.58 | Model Sparsity: 0.7605
Avg Perplexity: 24.307

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.7251 | Avg Accuracy: 47.15 | Model Sparsity: 0.7605
Avg Perplexity: 23.992

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.6248 | Avg Accuracy: 47.30 | Model Sparsity: 0.7605
Avg Perplexity: 23.372

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.5567 | Avg Accuracy: 48.09 | Model Sparsity: 0.7605
Avg Perplexity: 22.166

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.4786 | Avg Accuracy: 48.55 | Model Sparsity: 0.7605
Avg Perplexity: 21.203

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.4105 | Avg Accuracy: 48.75 | Model Sparsity: 0.7605
Avg Perplexity: 20.956

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.3670 | Avg Accuracy: 49.00 | Model Sparsity: 0.7605
Avg Perplexity: 20.987

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.3204 | Avg Accuracy: 48.74 | Model Sparsity: 0.7605
Avg Perplexity: 21.316



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.2782 | Avg Accuracy: 49.02 | Model Sparsity: 0.7605
Avg Perplexity: 21.113

[TRAINER] weights saved!
[Pruner] Adding hooks


Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.908.



Training Epoch [3/5]:   4%|▍         | 3/71 [00:00<00:12,  5.61it/s, Loss=4.8931, Sparsity=0.9079]


[Pruner] Removing hooks


                                                                                                   

Training epoch [3/5]: Avg Loss: 4.0747 | Avg Accuracy: 37.76 | Model Sparsity: 0.9079
Avg Perplexity: 52.048

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.2453 | Avg Accuracy: 40.53 | Model Sparsity: 0.9079
Avg Perplexity: 41.592

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.9883 | Avg Accuracy: 41.89 | Model Sparsity: 0.9079
Avg Perplexity: 37.444

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.8586 | Avg Accuracy: 42.49 | Model Sparsity: 0.9079
Avg Perplexity: 36.322

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.7663 | Avg Accuracy: 43.17 | Model Sparsity: 0.9079
Avg Perplexity: 34.268

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.6871 | Avg Accuracy: 43.54 | Model Sparsity: 0.9079
Avg Perplexity: 33.775

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.6336 | Avg Accuracy: 44.49 | Model Sparsity: 0.9079
Avg Perplexity: 31.100

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.5680 | Avg Accuracy: 44.76 | Model Sparsity: 0.9079
Avg Perplexity: 31.085

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.5249 | Avg Accuracy: 45.09 | Model Sparsity: 0.9079
Avg Perplexity: 29.902

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.4885 | Avg Accuracy: 44.95 | Model Sparsity: 0.9079
Avg Perplexity: 29.941



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.4529 | Avg Accuracy: 45.01 | Model Sparsity: 0.9079
Avg Perplexity: 30.413

[Pruner] Adding hooks


Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.962.



Training Epoch [4/5]:   4%|▍         | 3/71 [00:00<00:12,  5.37it/s, Loss=3.5601, Sparsity=0.9622]


[Pruner] Removing hooks


                                                                                                   

Training epoch [4/5]: Avg Loss: 3.1491 | Avg Accuracy: 40.70 | Model Sparsity: 0.9622
Avg Perplexity: 44.789

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.8154 | Avg Accuracy: 42.01 | Model Sparsity: 0.9622
Avg Perplexity: 38.615

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.7062 | Avg Accuracy: 42.43 | Model Sparsity: 0.9622
Avg Perplexity: 38.295

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.6372 | Avg Accuracy: 42.39 | Model Sparsity: 0.9622
Avg Perplexity: 38.377



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.5836 | Avg Accuracy: 43.07 | Model Sparsity: 0.9622
Avg Perplexity: 36.518

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.5453 | Avg Accuracy: 43.43 | Model Sparsity: 0.9622
Avg Perplexity: 35.325

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.5009 | Avg Accuracy: 43.36 | Model Sparsity: 0.9622
Avg Perplexity: 37.465



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.4778 | Avg Accuracy: 43.05 | Model Sparsity: 0.9622
Avg Perplexity: 36.199



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.4430 | Avg Accuracy: 43.90 | Model Sparsity: 0.9622
Avg Perplexity: 36.324

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.4063 | Avg Accuracy: 43.66 | Model Sparsity: 0.9622
Avg Perplexity: 37.442



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.3818 | Avg Accuracy: 44.01 | Model Sparsity: 0.9622
Avg Perplexity: 36.543

[TRAINER] weights saved!
[Pruner] Adding hooks


Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.970.



Training Epoch [5/5]:   4%|▍         | 3/71 [00:00<00:12,  5.31it/s, Loss=2.5155, Sparsity=0.97]


[Pruner] Removing hooks


                                                                                                 

Training epoch [5/5]: Avg Loss: 2.4815 | Avg Accuracy: 43.07 | Model Sparsity: 0.97
Avg Perplexity: 38.888

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 2.4050 | Avg Accuracy: 43.66 | Model Sparsity: 0.97
Avg Perplexity: 37.202

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 2.3686 | Avg Accuracy: 43.48 | Model Sparsity: 0.97
Avg Perplexity: 38.084



                                                                                                  

Recovery epoch [3/10]: Avg Loss: 2.3351 | Avg Accuracy: 43.87 | Model Sparsity: 0.97
Avg Perplexity: 35.876

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [4/10]: Avg Loss: 2.3092 | Avg Accuracy: 43.60 | Model Sparsity: 0.97
Avg Perplexity: 37.634



                                                                                                  

Recovery epoch [5/10]: Avg Loss: 2.2891 | Avg Accuracy: 44.01 | Model Sparsity: 0.97
Avg Perplexity: 38.474

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [6/10]: Avg Loss: 2.2685 | Avg Accuracy: 44.27 | Model Sparsity: 0.97
Avg Perplexity: 36.833

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [7/10]: Avg Loss: 2.2530 | Avg Accuracy: 43.50 | Model Sparsity: 0.97
Avg Perplexity: 38.572



                                                                                                  

Recovery epoch [8/10]: Avg Loss: 2.2343 | Avg Accuracy: 44.08 | Model Sparsity: 0.97
Avg Perplexity: 38.199



                                                                                                  

Recovery epoch [9/10]: Avg Loss: 2.2182 | Avg Accuracy: 44.00 | Model Sparsity: 0.97
Avg Perplexity: 39.007



                                                                                                   

Recovery epoch [10/10]: Avg Loss: 2.1923 | Avg Accuracy: 44.07 | Model Sparsity: 0.97
Avg Perplexity: 38.216

[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_wanda_pruning_0.97_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.97


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     44.65%
  Perplexity:   36.423

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9700 (97.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         wanda_pruning
  Target Sparsity:      0.97
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [7]:
# Baseline checkpoint for this model/task; handed to the trainer as its starting weights.
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Wanda pruning run towards TARGET_SPARSITY_HIGH on a cubic sparsity schedule.
training_args = TrainingArguments(
    # Model, task and starting checkpoint
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    finetuned_weights=finetuned_weights,
    # Optimisation settings
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    # Pruning settings
    learning_type="pruning",
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    # NOTE(review): the meaning of `db` is not visible from this notebook; kept as-is.
    db=False,
)
trainer = Trainer(training_args)

# TRAIN is the notebook-level switch defined next to MODEL_NAME / MODEL_TASK;
# when it is False the pruning run is skipped and the trainer is only evaluated.
if TRAIN:
    trainer.train()

# Evaluate and print the statistics summary for this configuration.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
.weight
model.weight
model.activation.weight
model.distilbert.weight
model.distilbert.embeddings.weight
model.distilbert.embeddings.word_embeddings.weight
model.distilbert.embeddings.position_embeddings.weight
model.distilbert.embeddings.LayerNorm.weight
model.distilbert.embeddings.dropout.weight
model.distilbert.transformer.weight
model.distilbert.transformer.layer.weight
model.distilbert.transformer.layer.0.weight
model.distilbert.transformer.layer.0.attention.weight
model.distilbert.transformer.layer.0.attention.dropout.weight
model.distilbert.transformer.layer.0.attention.q_lin.weight

Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.483.



Training Epoch [1/5]:   4%|▍         | 3/71 [00:01<00:19,  3.57it/s, Loss=2.8648, Sparsity=0.4831]


[Pruner] Removing hooks


                                                                                                   

Training epoch [1/5]: Avg Loss: 2.3681 | Avg Accuracy: 56.22 | Model Sparsity: 0.4831
Avg Perplexity: 9.950

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 2.1683 | Avg Accuracy: 56.92 | Model Sparsity: 0.4831
Avg Perplexity: 10.015

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.0955 | Avg Accuracy: 56.82 | Model Sparsity: 0.4831
Avg Perplexity: 9.739



                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.0379 | Avg Accuracy: 56.64 | Model Sparsity: 0.4831
Avg Perplexity: 9.952



                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.0011 | Avg Accuracy: 56.62 | Model Sparsity: 0.4831
Avg Perplexity: 9.755



                                                                                                    

Recovery epoch [5/10]: Avg Loss: 1.9472 | Avg Accuracy: 57.01 | Model Sparsity: 0.4831
Avg Perplexity: 9.951

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 1.9204 | Avg Accuracy: 56.78 | Model Sparsity: 0.4831
Avg Perplexity: 9.920



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 1.8813 | Avg Accuracy: 56.98 | Model Sparsity: 0.4831
Avg Perplexity: 9.948



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 1.8491 | Avg Accuracy: 57.10 | Model Sparsity: 0.4831
Avg Perplexity: 10.321

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 1.8209 | Avg Accuracy: 56.76 | Model Sparsity: 0.4831
Avg Perplexity: 10.500



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 1.7877 | Avg Accuracy: 56.57 | Model Sparsity: 0.4831
Avg Perplexity: 10.584

[Pruner] Adding hooks


Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.776.



Training Epoch [2/5]:   4%|▍         | 3/71 [00:00<00:12,  5.26it/s, Loss=5.2339, Sparsity=0.7762]


[Pruner] Removing hooks


                                                                                                   

Training epoch [2/5]: Avg Loss: 4.1171 | Avg Accuracy: 41.25 | Model Sparsity: 0.7762
Avg Perplexity: 38.577

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.2268 | Avg Accuracy: 44.13 | Model Sparsity: 0.7762
Avg Perplexity: 30.631

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 2.9779 | Avg Accuracy: 44.88 | Model Sparsity: 0.7762
Avg Perplexity: 27.495

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 2.8222 | Avg Accuracy: 46.23 | Model Sparsity: 0.7762
Avg Perplexity: 25.519

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.7124 | Avg Accuracy: 46.80 | Model Sparsity: 0.7762
Avg Perplexity: 25.062

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.6288 | Avg Accuracy: 47.67 | Model Sparsity: 0.7762
Avg Perplexity: 23.575

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.5638 | Avg Accuracy: 47.27 | Model Sparsity: 0.7762
Avg Perplexity: 23.442



                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.5071 | Avg Accuracy: 47.41 | Model Sparsity: 0.7762
Avg Perplexity: 23.558



                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.4396 | Avg Accuracy: 48.75 | Model Sparsity: 0.7762
Avg Perplexity: 21.887

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.3977 | Avg Accuracy: 48.53 | Model Sparsity: 0.7762
Avg Perplexity: 21.574



                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.3485 | Avg Accuracy: 48.30 | Model Sparsity: 0.7762
Avg Perplexity: 22.174

[Pruner] Adding hooks


Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.927.



Training Epoch [3/5]:   4%|▍         | 3/71 [00:00<00:12,  5.41it/s, Loss=5.2324, Sparsity=0.9266]


[Pruner] Removing hooks


                                                                                                   

Training epoch [3/5]: Avg Loss: 4.4529 | Avg Accuracy: 35.35 | Model Sparsity: 0.9266
Avg Perplexity: 64.212

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.4876 | Avg Accuracy: 38.31 | Model Sparsity: 0.9266
Avg Perplexity: 50.601

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 3.1976 | Avg Accuracy: 39.80 | Model Sparsity: 0.9266
Avg Perplexity: 44.621

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 3.0420 | Avg Accuracy: 40.73 | Model Sparsity: 0.9266
Avg Perplexity: 41.604

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.9287 | Avg Accuracy: 41.87 | Model Sparsity: 0.9266
Avg Perplexity: 38.117

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.8460 | Avg Accuracy: 41.63 | Model Sparsity: 0.9266
Avg Perplexity: 38.463



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.7682 | Avg Accuracy: 42.29 | Model Sparsity: 0.9266
Avg Perplexity: 36.490

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.7124 | Avg Accuracy: 42.62 | Model Sparsity: 0.9266
Avg Perplexity: 36.364

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.6626 | Avg Accuracy: 43.18 | Model Sparsity: 0.9266
Avg Perplexity: 35.241

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.6233 | Avg Accuracy: 43.27 | Model Sparsity: 0.9266
Avg Perplexity: 35.024

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.5775 | Avg Accuracy: 43.94 | Model Sparsity: 0.9266
Avg Perplexity: 33.597

[TRAINER] weights saved!
[Pruner] Adding hooks


Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.982.



Training Epoch [4/5]:   4%|▍         | 3/71 [00:00<00:11,  5.97it/s, Loss=4.9325, Sparsity=0.9821]


[Pruner] Removing hooks


                                                                                                   

Training epoch [4/5]: Avg Loss: 3.9571 | Avg Accuracy: 35.77 | Model Sparsity: 0.9821
Avg Perplexity: 66.887

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [1/10]: Avg Loss: 3.3093 | Avg Accuracy: 37.82 | Model Sparsity: 0.9821
Avg Perplexity: 56.659

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [2/10]: Avg Loss: 3.1235 | Avg Accuracy: 38.69 | Model Sparsity: 0.9821
Avg Perplexity: 52.321

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [3/10]: Avg Loss: 3.0251 | Avg Accuracy: 38.98 | Model Sparsity: 0.9821
Avg Perplexity: 51.195

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [4/10]: Avg Loss: 2.9389 | Avg Accuracy: 39.93 | Model Sparsity: 0.9821
Avg Perplexity: 47.507

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [5/10]: Avg Loss: 2.8724 | Avg Accuracy: 39.76 | Model Sparsity: 0.9821
Avg Perplexity: 48.035



                                                                                                    

Recovery epoch [6/10]: Avg Loss: 2.8166 | Avg Accuracy: 40.19 | Model Sparsity: 0.9821
Avg Perplexity: 46.672

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [7/10]: Avg Loss: 2.7798 | Avg Accuracy: 40.59 | Model Sparsity: 0.9821
Avg Perplexity: 45.829

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [8/10]: Avg Loss: 2.7307 | Avg Accuracy: 40.77 | Model Sparsity: 0.9821
Avg Perplexity: 45.440

[TRAINER] weights saved!


                                                                                                    

Recovery epoch [9/10]: Avg Loss: 2.6995 | Avg Accuracy: 40.93 | Model Sparsity: 0.9821
Avg Perplexity: 45.775

[TRAINER] weights saved!


                                                                                                     

Recovery epoch [10/10]: Avg Loss: 2.6705 | Avg Accuracy: 41.31 | Model Sparsity: 0.9821
Avg Perplexity: 44.129

[TRAINER] weights saved!
[Pruner] Adding hooks


Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.990.



Training Epoch [5/5]:   4%|▍         | 3/71 [00:00<00:12,  5.28it/s, Loss=3.0844, Sparsity=0.99]


[Pruner] Removing hooks


                                                                                                 

Training epoch [5/5]: Avg Loss: 2.9227 | Avg Accuracy: 39.71 | Model Sparsity: 0.99
Avg Perplexity: 50.265

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [1/10]: Avg Loss: 2.7724 | Avg Accuracy: 40.40 | Model Sparsity: 0.99
Avg Perplexity: 47.569

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [2/10]: Avg Loss: 2.7127 | Avg Accuracy: 40.48 | Model Sparsity: 0.99
Avg Perplexity: 47.485

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [3/10]: Avg Loss: 2.6747 | Avg Accuracy: 40.58 | Model Sparsity: 0.99
Avg Perplexity: 48.116

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [4/10]: Avg Loss: 2.6463 | Avg Accuracy: 40.06 | Model Sparsity: 0.99
Avg Perplexity: 49.328



                                                                                                  

Recovery epoch [5/10]: Avg Loss: 2.6093 | Avg Accuracy: 40.40 | Model Sparsity: 0.99
Avg Perplexity: 49.243



                                                                                                  

Recovery epoch [6/10]: Avg Loss: 2.5777 | Avg Accuracy: 40.85 | Model Sparsity: 0.99
Avg Perplexity: 47.393

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [7/10]: Avg Loss: 2.5552 | Avg Accuracy: 40.66 | Model Sparsity: 0.99
Avg Perplexity: 49.659



                                                                                                  

Recovery epoch [8/10]: Avg Loss: 2.5408 | Avg Accuracy: 41.14 | Model Sparsity: 0.99
Avg Perplexity: 47.604

[TRAINER] weights saved!


                                                                                                  

Recovery epoch [9/10]: Avg Loss: 2.5139 | Avg Accuracy: 40.63 | Model Sparsity: 0.99
Avg Perplexity: 48.296



                                                                                                   

Recovery epoch [10/10]: Avg Loss: 2.4914 | Avg Accuracy: 40.63 | Model Sparsity: 0.99
Avg Perplexity: 49.430

[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_wanda_pruning_0.99_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.99


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     41.03%
  Perplexity:   48.119

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9900 (99.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         wanda_pruning
  Target Sparsity:      0.99
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





## BaCP Accuracies

### Magnitude Pruning

In [None]:
# BaCP run: magnitude pruning at TARGET_SPARSITY_LOW, followed by a finetuning pass.
# Both train() calls below are gated on TRAIN; every other statement always executes.
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Pruning phase: the BaCP trainer is pointed at the baseline checkpoint above.
bacp_pruning_kwargs = {
    "model_name": MODEL_NAME,
    "model_task": MODEL_TASK,
    "batch_size": BATCH_SIZE_LLM,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": "magnitude_pruning",
    "target_sparsity": TARGET_SPARSITY_LOW,
    "sparsity_scheduler": 'cubic',
    "finetuned_weights": trained_model_path,
    "learning_type": 'bacp_pruning',
    "db": False,
}
bacp_training_args = BaCPTrainingArguments(**bacp_pruning_kwargs)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning phase: reuse the BaCP trainer's settings, saved weights and pruner.
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask from
# the model's current weights -- confirm against BaCPTrainer.
bacp_trainer.generate_mask_from_model()
bacp_finetune_kwargs = {
    "model_name": bacp_trainer.model_name,
    "model_task": bacp_trainer.model_task,
    "batch_size": bacp_trainer.batch_size,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": bacp_trainer.pruning_type,
    "target_sparsity": bacp_trainer.target_sparsity,
    "finetuned_weights": bacp_trainer.save_path,
    "epochs": 50,
    "pruner": bacp_trainer.get_pruner(),
    "finetune": True,
    "learning_type": "bacp_finetune",
    "db": False,
}
training_args = TrainingArguments(**bacp_finetune_kwargs)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

# Evaluate the trainer's model and print the statistics summary.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
# BaCP run: magnitude pruning at TARGET_SPARSITY_MID, followed by a finetuning pass.
# Both train() calls below are gated on TRAIN; every other statement always executes.
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Pruning phase: the BaCP trainer is pointed at the baseline checkpoint above.
bacp_pruning_kwargs = {
    "model_name": MODEL_NAME,
    "model_task": MODEL_TASK,
    "batch_size": BATCH_SIZE_LLM,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": "magnitude_pruning",
    "target_sparsity": TARGET_SPARSITY_MID,
    "sparsity_scheduler": 'cubic',
    "finetuned_weights": trained_model_path,
    "learning_type": 'bacp_pruning',
    "db": False,
}
bacp_training_args = BaCPTrainingArguments(**bacp_pruning_kwargs)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning phase: reuse the BaCP trainer's settings, saved weights and pruner.
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask from
# the model's current weights -- confirm against BaCPTrainer.
bacp_trainer.generate_mask_from_model()
bacp_finetune_kwargs = {
    "model_name": bacp_trainer.model_name,
    "model_task": bacp_trainer.model_task,
    "batch_size": bacp_trainer.batch_size,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": bacp_trainer.pruning_type,
    "target_sparsity": bacp_trainer.target_sparsity,
    "finetuned_weights": bacp_trainer.save_path,
    "epochs": 50,
    "pruner": bacp_trainer.get_pruner(),
    "finetune": True,
    "learning_type": "bacp_finetune",
    "db": False,
}
training_args = TrainingArguments(**bacp_finetune_kwargs)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

# Evaluate the trainer's model and print the statistics summary.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
# BaCP run: magnitude pruning at TARGET_SPARSITY_HIGH, followed by a finetuning pass.
# Both train() calls below are gated on TRAIN; every other statement always executes.
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Pruning phase: the BaCP trainer is pointed at the baseline checkpoint above.
bacp_pruning_kwargs = {
    "model_name": MODEL_NAME,
    "model_task": MODEL_TASK,
    "batch_size": BATCH_SIZE_LLM,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": "magnitude_pruning",
    "target_sparsity": TARGET_SPARSITY_HIGH,
    "sparsity_scheduler": 'cubic',
    "finetuned_weights": trained_model_path,
    "learning_type": 'bacp_pruning',
    "db": False,
}
bacp_training_args = BaCPTrainingArguments(**bacp_pruning_kwargs)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning phase: reuse the BaCP trainer's settings, saved weights and pruner.
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask from
# the model's current weights -- confirm against BaCPTrainer.
bacp_trainer.generate_mask_from_model()
bacp_finetune_kwargs = {
    "model_name": bacp_trainer.model_name,
    "model_task": bacp_trainer.model_task,
    "batch_size": bacp_trainer.batch_size,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": bacp_trainer.pruning_type,
    "target_sparsity": bacp_trainer.target_sparsity,
    "finetuned_weights": bacp_trainer.save_path,
    "epochs": 50,
    "pruner": bacp_trainer.get_pruner(),
    "finetune": True,
    "learning_type": "bacp_finetune",
    "db": False,
}
training_args = TrainingArguments(**bacp_finetune_kwargs)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

# Evaluate the trainer's model and print the statistics summary.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

### SNIP-it Prune

In [0]:
# BaCP run: SNIP pruning ("snip_pruning") at TARGET_SPARSITY_LOW, followed by a finetuning pass.
# Both train() calls below are gated on TRAIN; every other statement always executes.
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Pruning phase: the BaCP trainer is pointed at the baseline checkpoint above.
bacp_pruning_kwargs = {
    "model_name": MODEL_NAME,
    "model_task": MODEL_TASK,
    "batch_size": BATCH_SIZE_LLM,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": "snip_pruning",
    "target_sparsity": TARGET_SPARSITY_LOW,
    "sparsity_scheduler": 'cubic',
    "finetuned_weights": trained_model_path,
    "learning_type": 'bacp_pruning',
    "db": False,
}
bacp_training_args = BaCPTrainingArguments(**bacp_pruning_kwargs)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning phase: reuse the BaCP trainer's settings, saved weights and pruner.
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask from
# the model's current weights -- confirm against BaCPTrainer.
bacp_trainer.generate_mask_from_model()
bacp_finetune_kwargs = {
    "model_name": bacp_trainer.model_name,
    "model_task": bacp_trainer.model_task,
    "batch_size": bacp_trainer.batch_size,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": bacp_trainer.pruning_type,
    "target_sparsity": bacp_trainer.target_sparsity,
    "finetuned_weights": bacp_trainer.save_path,
    "epochs": 50,
    "pruner": bacp_trainer.get_pruner(),
    "finetune": True,
    "learning_type": "bacp_finetune",
    "db": False,
}
training_args = TrainingArguments(**bacp_finetune_kwargs)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

# Evaluate the trainer's model and print the statistics summary.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
# BaCP run: SNIP pruning ("snip_pruning") at TARGET_SPARSITY_MID, followed by a finetuning pass.
# Both train() calls below are gated on TRAIN; every other statement always executes.
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Pruning phase: the BaCP trainer is pointed at the baseline checkpoint above.
bacp_pruning_kwargs = {
    "model_name": MODEL_NAME,
    "model_task": MODEL_TASK,
    "batch_size": BATCH_SIZE_LLM,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": "snip_pruning",
    "target_sparsity": TARGET_SPARSITY_MID,
    "sparsity_scheduler": 'cubic',
    "finetuned_weights": trained_model_path,
    "learning_type": 'bacp_pruning',
    "db": False,
}
bacp_training_args = BaCPTrainingArguments(**bacp_pruning_kwargs)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning phase: reuse the BaCP trainer's settings, saved weights and pruner.
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask from
# the model's current weights -- confirm against BaCPTrainer.
bacp_trainer.generate_mask_from_model()
bacp_finetune_kwargs = {
    "model_name": bacp_trainer.model_name,
    "model_task": bacp_trainer.model_task,
    "batch_size": bacp_trainer.batch_size,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": bacp_trainer.pruning_type,
    "target_sparsity": bacp_trainer.target_sparsity,
    "finetuned_weights": bacp_trainer.save_path,
    "epochs": 50,
    "pruner": bacp_trainer.get_pruner(),
    "finetune": True,
    "learning_type": "bacp_finetune",
    "db": False,
}
training_args = TrainingArguments(**bacp_finetune_kwargs)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

# Evaluate the trainer's model and print the statistics summary.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
# BaCP run: SNIP pruning ("snip_pruning") at TARGET_SPARSITY_HIGH, followed by a finetuning pass.
# Both train() calls below are gated on TRAIN; every other statement always executes.
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Pruning phase: the BaCP trainer is pointed at the baseline checkpoint above.
bacp_pruning_kwargs = {
    "model_name": MODEL_NAME,
    "model_task": MODEL_TASK,
    "batch_size": BATCH_SIZE_LLM,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": "snip_pruning",
    "target_sparsity": TARGET_SPARSITY_HIGH,
    "sparsity_scheduler": 'cubic',
    "finetuned_weights": trained_model_path,
    "learning_type": 'bacp_pruning',
    "db": False,
}
bacp_training_args = BaCPTrainingArguments(**bacp_pruning_kwargs)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning phase: reuse the BaCP trainer's settings, saved weights and pruner.
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask from
# the model's current weights -- confirm against BaCPTrainer.
bacp_trainer.generate_mask_from_model()
bacp_finetune_kwargs = {
    "model_name": bacp_trainer.model_name,
    "model_task": bacp_trainer.model_task,
    "batch_size": bacp_trainer.batch_size,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": bacp_trainer.pruning_type,
    "target_sparsity": bacp_trainer.target_sparsity,
    "finetuned_weights": bacp_trainer.save_path,
    "epochs": 50,
    "pruner": bacp_trainer.get_pruner(),
    "finetune": True,
    "learning_type": "bacp_finetune",
    "db": False,
}
training_args = TrainingArguments(**bacp_finetune_kwargs)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

# Evaluate the trainer's model and print the statistics summary.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

### WandA Prune

In [0]:
# BaCP run: Wanda pruning ("wanda_pruning") at TARGET_SPARSITY_LOW, followed by a finetuning pass.
# Both train() calls below are gated on TRAIN; every other statement always executes.
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Pruning phase: the BaCP trainer is pointed at the baseline checkpoint above.
bacp_pruning_kwargs = {
    "model_name": MODEL_NAME,
    "model_task": MODEL_TASK,
    "batch_size": BATCH_SIZE_LLM,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": "wanda_pruning",
    "target_sparsity": TARGET_SPARSITY_LOW,
    "sparsity_scheduler": 'cubic',
    "finetuned_weights": trained_model_path,
    "learning_type": 'bacp_pruning',
    "db": False,
}
bacp_training_args = BaCPTrainingArguments(**bacp_pruning_kwargs)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning phase: reuse the BaCP trainer's settings, saved weights and pruner.
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask from
# the model's current weights -- confirm against BaCPTrainer.
bacp_trainer.generate_mask_from_model()
bacp_finetune_kwargs = {
    "model_name": bacp_trainer.model_name,
    "model_task": bacp_trainer.model_task,
    "batch_size": bacp_trainer.batch_size,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": bacp_trainer.pruning_type,
    "target_sparsity": bacp_trainer.target_sparsity,
    "finetuned_weights": bacp_trainer.save_path,
    "epochs": 50,
    "pruner": bacp_trainer.get_pruner(),
    "finetune": True,
    "learning_type": "bacp_finetune",
    "db": False,
}
training_args = TrainingArguments(**bacp_finetune_kwargs)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

# Evaluate the trainer's model and print the statistics summary.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
# BaCP run: Wanda pruning ("wanda_pruning") at TARGET_SPARSITY_MID, followed by a finetuning pass.
# Both train() calls below are gated on TRAIN; every other statement always executes.
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Pruning phase: the BaCP trainer is pointed at the baseline checkpoint above.
bacp_pruning_kwargs = {
    "model_name": MODEL_NAME,
    "model_task": MODEL_TASK,
    "batch_size": BATCH_SIZE_LLM,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": "wanda_pruning",
    "target_sparsity": TARGET_SPARSITY_MID,
    "sparsity_scheduler": 'cubic',
    "finetuned_weights": trained_model_path,
    "learning_type": 'bacp_pruning',
    "db": False,
}
bacp_training_args = BaCPTrainingArguments(**bacp_pruning_kwargs)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning phase: reuse the BaCP trainer's settings, saved weights and pruner.
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask from
# the model's current weights -- confirm against BaCPTrainer.
bacp_trainer.generate_mask_from_model()
bacp_finetune_kwargs = {
    "model_name": bacp_trainer.model_name,
    "model_task": bacp_trainer.model_task,
    "batch_size": bacp_trainer.batch_size,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": bacp_trainer.pruning_type,
    "target_sparsity": bacp_trainer.target_sparsity,
    "finetuned_weights": bacp_trainer.save_path,
    "epochs": 50,
    "pruner": bacp_trainer.get_pruner(),
    "finetune": True,
    "learning_type": "bacp_finetune",
    "db": False,
}
training_args = TrainingArguments(**bacp_finetune_kwargs)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

# Evaluate the trainer's model and print the statistics summary.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
# BaCP run: Wanda pruning ("wanda_pruning") at TARGET_SPARSITY_HIGH, followed by a finetuning pass.
# Both train() calls below are gated on TRAIN; every other statement always executes.
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

# Pruning phase: the BaCP trainer is pointed at the baseline checkpoint above.
bacp_pruning_kwargs = {
    "model_name": MODEL_NAME,
    "model_task": MODEL_TASK,
    "batch_size": BATCH_SIZE_LLM,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": "wanda_pruning",
    "target_sparsity": TARGET_SPARSITY_HIGH,
    "sparsity_scheduler": 'cubic',
    "finetuned_weights": trained_model_path,
    "learning_type": 'bacp_pruning',
    "db": False,
}
bacp_training_args = BaCPTrainingArguments(**bacp_pruning_kwargs)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()

# Finetuning phase: reuse the BaCP trainer's settings, saved weights and pruner.
# NOTE(review): generate_mask_from_model() presumably rebuilds the pruning mask from
# the model's current weights -- confirm against BaCPTrainer.
bacp_trainer.generate_mask_from_model()
bacp_finetune_kwargs = {
    "model_name": bacp_trainer.model_name,
    "model_task": bacp_trainer.model_task,
    "batch_size": bacp_trainer.batch_size,
    "optimizer_type_and_lr": ('adamw', 1e-3),
    "pruning_type": bacp_trainer.pruning_type,
    "target_sparsity": bacp_trainer.target_sparsity,
    "finetuned_weights": bacp_trainer.save_path,
    "epochs": 50,
    "pruner": bacp_trainer.get_pruner(),
    "finetune": True,
    "learning_type": "bacp_finetune",
    "db": False,
}
training_args = TrainingArguments(**bacp_finetune_kwargs)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

# Evaluate the trainer's model and print the statistics summary.
metrics = trainer.evaluate()
print_statistics(metrics, trainer)