In [1]:
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload; run %autoreload 0

In [2]:
import os
import sys
sys.path.append(os.path.abspath('..'))

from constants import (
    TARGET_SPARSITY_LOW, TARGET_SPARSITY_MID, TARGET_SPARSITY_HIGH,
    BATCH_SIZE_CNN, BATCH_SIZE_VIT, BATCH_SIZE_LLM,
    EPOCHS_SMALL_MODEL, EPOCHS_LARGE_MODEL, EPOCHS_VIT
)
from utils import get_device, get_num_workers, load_weights, print_statistics
from unstructured_pruning import check_model_sparsity, check_sparsity_distribution
from trainer import TrainingArguments, Trainer
from bacp import BaCPTrainingArguments, BaCPTrainer

from datasets.utils.logging import disable_progress_bar
disable_progress_bar()
os.environ["HF_DATASETS_CACHE"] = "/dbfs/hf_datasets"
os.environ["TOKENIZERS_PARALLELISM"] = "false" 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DEVICE = get_device()
NUM_WORKERS = get_num_workers()
print("Using device:", DEVICE)
print("Using", NUM_WORKERS, "workers")

Using device: cuda
Using 288 workers


# DistilBERT

In [4]:
MODEL_NAME = "distilbert-base-uncased"
MODEL_TASK = "wikitext2"
TRAIN = True

## Baseline Accuracies

In [9]:
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-5),
    scheduler_type='linear_with_warmup',
    epochs=50,
    learning_type="baseline",
    db=False,
)
trainer = Trainer(training_args=training_args)
if False:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Optimizer type w/ learning rate: (adamw, 5e-05)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] Linear scheduler initialized with warmup steps: 355 and total steps: 3550
[TRAINER] Pruning not initialized
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.0


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     64.00%
  Perplexity:   5.743

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.0000 (0.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        baseline
  Batch Size:           64
  Learning Rate:        5e-05
  Optimizer:            adamw
  Epochs:               50

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





## Pruning Accuracies

### Magnitude Prune

In [21]:
# Initializing finetuned weights path
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if False:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Target sparsity: 0.95
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.95_pruning.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.95_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.95


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     47.98%
  Perplexity:   27.106

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9500 (95.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         magnitude_pruning
  Target Sparsity:      0.95
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [10]:
# Initializing finetuned weights path
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Target sparsity: 0.97
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.97_pruning.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.97_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.97


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     45.93%
  Perplexity:   30.643

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9700 (97.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         magnitude_pruning
  Target Sparsity:      0.97
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [11]:
# Initializing finetuned weights path
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Target sparsity: 0.99
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.99_pruning.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.99_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.99


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     41.23%
  Perplexity:   45.285

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9900 (99.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         magnitude_pruning
  Target Sparsity:      0.99
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





### SNIP-it Prune

In [12]:
# Initializing finetuned weights path
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="snip_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: snip_pruning
[TRAINER] Target sparsity: 0.95
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.95_pruning.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.95_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.95


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     45.38%
  Perplexity:   34.360

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9500 (95.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         snip_pruning
  Target Sparsity:      0.95
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [13]:
# Initializing finetuned weights path
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="snip_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: snip_pruning
[TRAINER] Target sparsity: 0.97
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.97_pruning.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.97_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.97


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     42.39%
  Perplexity:   39.963

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9700 (97.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         snip_pruning
  Target Sparsity:      0.97
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [14]:
# Initializing finetuned weights path
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="snip_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Pruning initialized
[TRAINER] Pruning type: snip_pruning
[TRAINER] Target sparsity: 0.99
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.99_pruning.pt
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_snip_pruning_0.99_pruning.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.99


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     37.65%
  Perplexity:   64.528

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9900 (99.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         snip_pruning
  Target Sparsity:      0.99
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





### WandA Prune

In [15]:
# Initializing finetuned weights path
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
.weight
model.weight
model.activation.weight
model.distilbert.weight
model.distilbert.embeddings.weight
model.distilbert.embeddings.word_embeddings.weight
model.distilbert.embeddings.position_embeddings.weight
model.distilbert.embeddings.LayerNorm.weight
model.distilbert.embeddings.dropout.weight
model.distilbert.transformer.weight
model.distilbert.transformer.layer.weight
model.distilbert.transformer.layer.0.weight
model.distilbert.transformer.layer.0.attention.weight
model.distilbert.transformer.layer.0.attention.dropout.weight
model.distilbert.transformer.layer.0.attention.q_lin.weight

                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     45.70%
  Perplexity:   34.683

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9500 (95.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         wanda_pruning
  Target Sparsity:      0.95
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [18]:
# Initializing finetuned weights path
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
.weight
model.weight
model.activation.weight
model.distilbert.weight
model.distilbert.embeddings.weight
model.distilbert.embeddings.word_embeddings.weight
model.distilbert.embeddings.position_embeddings.weight
model.distilbert.embeddings.LayerNorm.weight
model.distilbert.embeddings.dropout.weight
model.distilbert.transformer.weight
model.distilbert.transformer.layer.weight
model.distilbert.transformer.layer.0.weight
model.distilbert.transformer.layer.0.attention.weight
model.distilbert.transformer.layer.0.attention.dropout.weight
model.distilbert.transformer.layer.0.attention.q_lin.weight

                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     43.57%
  Perplexity:   38.139

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9700 (97.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         wanda_pruning
  Target Sparsity:      0.97
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [17]:
# Initializing finetuned weights path
finetuned_weights = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"
training_args = TrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,
    optimizer_type_and_lr=('adamw', 5e-4),
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=finetuned_weights,
    learning_type="pruning",
    db=False,
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_baseline.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.0005)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
.weight
model.weight
model.activation.weight
model.distilbert.weight
model.distilbert.embeddings.weight
model.distilbert.embeddings.word_embeddings.weight
model.distilbert.embeddings.position_embeddings.weight
model.distilbert.embeddings.LayerNorm.weight
model.distilbert.embeddings.dropout.weight
model.distilbert.transformer.weight
model.distilbert.transformer.layer.weight
model.distilbert.transformer.layer.0.weight
model.distilbert.transformer.layer.0.attention.weight
model.distilbert.transformer.layer.0.attention.dropout.weight
model.distilbert.transformer.layer.0.attention.q_lin.weight

                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     40.85%
  Perplexity:   48.597

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9900 (99.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        pruning
  Batch Size:           64
  Learning Rate:        0.0005
  Optimizer:            adamw
  Epochs:               5

Pruning Configuration:
------------------------------
  Pruning Type:         wanda_pruning
  Target Sparsity:      0.99
  Sparsity Scheduler:   cubic
  Recovery Epochs:      10

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





## BaCP Accuracies

### Magnitude Pruning

In [5]:
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,   
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
    db=False,
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()
    
# Finetuning Phase
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
    db=False
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Weights loaded successfully
[TRAINER] Initialized BaCP models
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.001)
[TRAINER] No scheduler initialized
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] Pruning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Target sparsity: 0.95
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.95_bacp_pruning.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased/wikitext2/bacp_pruning/magnitude_pruning/0.95/run_4.log


Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.464.



                                                                                                                                        

Epoch [1/5]: Avg Total Loss: 7.6227 | Avg PrC Loss: 2.6096 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.4856 | Avg CE Loss: 2.5275 | Model Sparsity: 0.4636

[BaCP] weights saved!


                                                                                                                                           

Retraining Epoch [1/10]: Avg Total Loss: 6.6930 | Avg PrC Loss: 2.2470 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0331 | Avg CE Loss: 2.4129 | Model Sparsity: 0.4636



                                                                                                                                           

Retraining Epoch [2/10]: Avg Total Loss: 6.5663 | Avg PrC Loss: 2.2379 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0104 | Avg CE Loss: 2.3179 | Model Sparsity: 0.4636



                                                                                                                                           

Retraining Epoch [3/10]: Avg Total Loss: 6.4671 | Avg PrC Loss: 2.2390 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0061 | Avg CE Loss: 2.2221 | Model Sparsity: 0.4636



                                                                                                                                           

Retraining Epoch [4/10]: Avg Total Loss: 6.3978 | Avg PrC Loss: 2.2537 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0184 | Avg CE Loss: 2.1258 | Model Sparsity: 0.4636



                                                                                                                                           

Retraining Epoch [5/10]: Avg Total Loss: 6.3208 | Avg PrC Loss: 2.2600 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0232 | Avg CE Loss: 2.0376 | Model Sparsity: 0.4636



                                                                                                                                           

Retraining Epoch [6/10]: Avg Total Loss: 6.2463 | Avg PrC Loss: 2.2672 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0282 | Avg CE Loss: 1.9510 | Model Sparsity: 0.4636



                                                                                                                                           

Retraining Epoch [7/10]: Avg Total Loss: 6.1967 | Avg PrC Loss: 2.2791 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0397 | Avg CE Loss: 1.8780 | Model Sparsity: 0.4636



                                                                                                                                           

Retraining Epoch [8/10]: Avg Total Loss: 6.1418 | Avg PrC Loss: 2.2837 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0428 | Avg CE Loss: 1.8153 | Model Sparsity: 0.4636



                                                                                                                                           

Retraining Epoch [9/10]: Avg Total Loss: 6.0877 | Avg PrC Loss: 2.2815 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0392 | Avg CE Loss: 1.7670 | Model Sparsity: 0.4636



                                                                                                                                            

Retraining Epoch [10/10]: Avg Total Loss: 6.0504 | Avg PrC Loss: 2.2813 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0373 | Avg CE Loss: 1.7317 | Model Sparsity: 0.4636



Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.745.



                                                                                                                                           

Epoch [2/5]: Avg Total Loss: 9.6940 | Avg PrC Loss: 2.9796 | Avg SnC Loss: 1.9367 | Avg FiC Loss: 2.9129 | Avg CE Loss: 1.8648 | Model Sparsity: 0.7448

[BaCP] weights saved!


                                                                                                                                              

Retraining Epoch [1/10]: Avg Total Loss: 8.4258 | Avg PrC Loss: 2.5940 | Avg SnC Loss: 1.6468 | Avg FiC Loss: 2.4622 | Avg CE Loss: 1.7228 | Model Sparsity: 0.7448



                                                                                                                                              

Retraining Epoch [2/10]: Avg Total Loss: 8.2697 | Avg PrC Loss: 2.5431 | Avg SnC Loss: 1.6452 | Avg FiC Loss: 2.3951 | Avg CE Loss: 1.6863 | Model Sparsity: 0.7448



                                                                                                                                              

Retraining Epoch [3/10]: Avg Total Loss: 8.1903 | Avg PrC Loss: 2.5228 | Avg SnC Loss: 1.6447 | Avg FiC Loss: 2.3638 | Avg CE Loss: 1.6590 | Model Sparsity: 0.7448



                                                                                                                                              

Retraining Epoch [4/10]: Avg Total Loss: 8.1185 | Avg PrC Loss: 2.5039 | Avg SnC Loss: 1.6416 | Avg FiC Loss: 2.3377 | Avg CE Loss: 1.6353 | Model Sparsity: 0.7448



                                                                                                                                              

Retraining Epoch [5/10]: Avg Total Loss: 8.0666 | Avg PrC Loss: 2.4910 | Avg SnC Loss: 1.6392 | Avg FiC Loss: 2.3188 | Avg CE Loss: 1.6175 | Model Sparsity: 0.7448



                                                                                                                                              

Retraining Epoch [6/10]: Avg Total Loss: 8.0210 | Avg PrC Loss: 2.4823 | Avg SnC Loss: 1.6371 | Avg FiC Loss: 2.3055 | Avg CE Loss: 1.5961 | Model Sparsity: 0.7448



                                                                                                                                              

Retraining Epoch [7/10]: Avg Total Loss: 7.9895 | Avg PrC Loss: 2.4765 | Avg SnC Loss: 1.6402 | Avg FiC Loss: 2.2969 | Avg CE Loss: 1.5760 | Model Sparsity: 0.7448



                                                                                                                                              

Retraining Epoch [8/10]: Avg Total Loss: 7.9619 | Avg PrC Loss: 2.4712 | Avg SnC Loss: 1.6372 | Avg FiC Loss: 2.2885 | Avg CE Loss: 1.5650 | Model Sparsity: 0.7448



                                                                                                                                              

Retraining Epoch [9/10]: Avg Total Loss: 7.9268 | Avg PrC Loss: 2.4643 | Avg SnC Loss: 1.6323 | Avg FiC Loss: 2.2793 | Avg CE Loss: 1.5509 | Model Sparsity: 0.7448



                                                                                                                                               

Retraining Epoch [10/10]: Avg Total Loss: 7.8955 | Avg PrC Loss: 2.4564 | Avg SnC Loss: 1.6299 | Avg FiC Loss: 2.2700 | Avg CE Loss: 1.5392 | Model Sparsity: 0.7448



Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.889.



                                                                                                                                           

Epoch [3/5]: Avg Total Loss: 11.6658 | Avg PrC Loss: 3.0429 | Avg SnC Loss: 3.8011 | Avg FiC Loss: 2.9977 | Avg CE Loss: 1.8241 | Model Sparsity: 0.8892

[BaCP] weights saved!


                                                                                                                                              

Retraining Epoch [1/10]: Avg Total Loss: 10.3376 | Avg PrC Loss: 2.7680 | Avg SnC Loss: 3.2002 | Avg FiC Loss: 2.6811 | Avg CE Loss: 1.6882 | Model Sparsity: 0.8892



                                                                                                                                              

Retraining Epoch [2/10]: Avg Total Loss: 10.1564 | Avg PrC Loss: 2.7234 | Avg SnC Loss: 3.1591 | Avg FiC Loss: 2.6244 | Avg CE Loss: 1.6495 | Model Sparsity: 0.8892



                                                                                                                                              

Retraining Epoch [3/10]: Avg Total Loss: 10.0532 | Avg PrC Loss: 2.6984 | Avg SnC Loss: 3.1386 | Avg FiC Loss: 2.5912 | Avg CE Loss: 1.6250 | Model Sparsity: 0.8892



                                                                                                                                              

Retraining Epoch [4/10]: Avg Total Loss: 9.9922 | Avg PrC Loss: 2.6845 | Avg SnC Loss: 3.1322 | Avg FiC Loss: 2.5720 | Avg CE Loss: 1.6034 | Model Sparsity: 0.8892



                                                                                                                                              

Retraining Epoch [5/10]: Avg Total Loss: 9.9364 | Avg PrC Loss: 2.6706 | Avg SnC Loss: 3.1243 | Avg FiC Loss: 2.5535 | Avg CE Loss: 1.5880 | Model Sparsity: 0.8892



                                                                                                                                              

Retraining Epoch [6/10]: Avg Total Loss: 9.8877 | Avg PrC Loss: 2.6588 | Avg SnC Loss: 3.1145 | Avg FiC Loss: 2.5383 | Avg CE Loss: 1.5762 | Model Sparsity: 0.8892



                                                                                                                                              

Retraining Epoch [7/10]: Avg Total Loss: 9.8521 | Avg PrC Loss: 2.6521 | Avg SnC Loss: 3.1073 | Avg FiC Loss: 2.5281 | Avg CE Loss: 1.5647 | Model Sparsity: 0.8892



                                                                                                                                              

Retraining Epoch [8/10]: Avg Total Loss: 9.8123 | Avg PrC Loss: 2.6404 | Avg SnC Loss: 3.1022 | Avg FiC Loss: 2.5157 | Avg CE Loss: 1.5540 | Model Sparsity: 0.8892



                                                                                                                                              

Retraining Epoch [9/10]: Avg Total Loss: 9.8017 | Avg PrC Loss: 2.6414 | Avg SnC Loss: 3.1042 | Avg FiC Loss: 2.5131 | Avg CE Loss: 1.5430 | Model Sparsity: 0.8892



                                                                                                                                               

Retraining Epoch [10/10]: Avg Total Loss: 9.7609 | Avg PrC Loss: 2.6313 | Avg SnC Loss: 3.0928 | Avg FiC Loss: 2.5018 | Avg CE Loss: 1.5350 | Model Sparsity: 0.8892



Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.942.



                                                                                                                                           

Epoch [4/5]: Avg Total Loss: 13.1588 | Avg PrC Loss: 3.0512 | Avg SnC Loss: 5.3616 | Avg FiC Loss: 2.9993 | Avg CE Loss: 1.7467 | Model Sparsity: 0.9424

[BaCP] weights saved!


                                                                                                                                              

Retraining Epoch [1/10]: Avg Total Loss: 12.0958 | Avg PrC Loss: 2.8494 | Avg SnC Loss: 4.8206 | Avg FiC Loss: 2.7748 | Avg CE Loss: 1.6509 | Model Sparsity: 0.9424



                                                                                                                                              

Retraining Epoch [2/10]: Avg Total Loss: 11.9648 | Avg PrC Loss: 2.8189 | Avg SnC Loss: 4.7855 | Avg FiC Loss: 2.7370 | Avg CE Loss: 1.6233 | Model Sparsity: 0.9424



                                                                                                                                              

Retraining Epoch [3/10]: Avg Total Loss: 11.8718 | Avg PrC Loss: 2.7972 | Avg SnC Loss: 4.7499 | Avg FiC Loss: 2.7122 | Avg CE Loss: 1.6125 | Model Sparsity: 0.9424



                                                                                                                                              

Retraining Epoch [4/10]: Avg Total Loss: 11.8089 | Avg PrC Loss: 2.7847 | Avg SnC Loss: 4.7321 | Avg FiC Loss: 2.6959 | Avg CE Loss: 1.5962 | Model Sparsity: 0.9424



                                                                                                                                              

Retraining Epoch [5/10]: Avg Total Loss: 11.7529 | Avg PrC Loss: 2.7726 | Avg SnC Loss: 4.7146 | Avg FiC Loss: 2.6810 | Avg CE Loss: 1.5847 | Model Sparsity: 0.9424



                                                                                                                                              

Retraining Epoch [6/10]: Avg Total Loss: 11.7119 | Avg PrC Loss: 2.7649 | Avg SnC Loss: 4.7017 | Avg FiC Loss: 2.6714 | Avg CE Loss: 1.5738 | Model Sparsity: 0.9424



                                                                                                                                              

Retraining Epoch [7/10]: Avg Total Loss: 11.6647 | Avg PrC Loss: 2.7541 | Avg SnC Loss: 4.6843 | Avg FiC Loss: 2.6588 | Avg CE Loss: 1.5675 | Model Sparsity: 0.9424



                                                                                                                                              

Retraining Epoch [8/10]: Avg Total Loss: 11.6485 | Avg PrC Loss: 2.7518 | Avg SnC Loss: 4.6839 | Avg FiC Loss: 2.6545 | Avg CE Loss: 1.5583 | Model Sparsity: 0.9424



                                                                                                                                              

Retraining Epoch [9/10]: Avg Total Loss: 11.6340 | Avg PrC Loss: 2.7508 | Avg SnC Loss: 4.6845 | Avg FiC Loss: 2.6507 | Avg CE Loss: 1.5481 | Model Sparsity: 0.9424



                                                                                                                                               

Retraining Epoch [10/10]: Avg Total Loss: 11.5911 | Avg PrC Loss: 2.7435 | Avg SnC Loss: 4.6637 | Avg FiC Loss: 2.6417 | Avg CE Loss: 1.5422 | Model Sparsity: 0.9424



Training Epoch [5/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.950.



                                                                                                                                           

Epoch [5/5]: Avg Total Loss: 13.3861 | Avg PrC Loss: 2.7970 | Avg SnC Loss: 6.3221 | Avg FiC Loss: 2.7087 | Avg CE Loss: 1.5583 | Model Sparsity: 0.95

[BaCP] weights saved!


                                                                                                                                              

Retraining Epoch [1/10]: Avg Total Loss: 13.2747 | Avg PrC Loss: 2.7793 | Avg SnC Loss: 6.2676 | Avg FiC Loss: 2.6864 | Avg CE Loss: 1.5415 | Model Sparsity: 0.95



                                                                                                                                              

Retraining Epoch [2/10]: Avg Total Loss: 13.2405 | Avg PrC Loss: 2.7739 | Avg SnC Loss: 6.2537 | Avg FiC Loss: 2.6797 | Avg CE Loss: 1.5332 | Model Sparsity: 0.95



                                                                                                                                              

Retraining Epoch [3/10]: Avg Total Loss: 13.2162 | Avg PrC Loss: 2.7712 | Avg SnC Loss: 6.2464 | Avg FiC Loss: 2.6755 | Avg CE Loss: 1.5231 | Model Sparsity: 0.95



                                                                                                                                              

Retraining Epoch [4/10]: Avg Total Loss: 13.1851 | Avg PrC Loss: 2.7662 | Avg SnC Loss: 6.2286 | Avg FiC Loss: 2.6687 | Avg CE Loss: 1.5216 | Model Sparsity: 0.95



                                                                                                                                              

Retraining Epoch [5/10]: Avg Total Loss: 13.1655 | Avg PrC Loss: 2.7643 | Avg SnC Loss: 6.2170 | Avg FiC Loss: 2.6663 | Avg CE Loss: 1.5179 | Model Sparsity: 0.95



                                                                                                                                              

Retraining Epoch [6/10]: Avg Total Loss: 13.1446 | Avg PrC Loss: 2.7617 | Avg SnC Loss: 6.2113 | Avg FiC Loss: 2.6622 | Avg CE Loss: 1.5093 | Model Sparsity: 0.95



                                                                                                                                              

Retraining Epoch [7/10]: Avg Total Loss: 13.1302 | Avg PrC Loss: 2.7595 | Avg SnC Loss: 6.2065 | Avg FiC Loss: 2.6592 | Avg CE Loss: 1.5049 | Model Sparsity: 0.95



                                                                                                                                              

Retraining Epoch [8/10]: Avg Total Loss: 13.1022 | Avg PrC Loss: 2.7543 | Avg SnC Loss: 6.1930 | Avg FiC Loss: 2.6537 | Avg CE Loss: 1.5012 | Model Sparsity: 0.95



                                                                                                                                              

Retraining Epoch [9/10]: Avg Total Loss: 13.0828 | Avg PrC Loss: 2.7544 | Avg SnC Loss: 6.1826 | Avg FiC Loss: 2.6518 | Avg CE Loss: 1.4941 | Model Sparsity: 0.95



                                                                                                                                               

Retraining Epoch [10/10]: Avg Total Loss: 13.0830 | Avg PrC Loss: 2.7551 | Avg SnC Loss: 6.1827 | Avg FiC Loss: 2.6525 | Avg CE Loss: 1.4927 | Model Sparsity: 0.95

[BaCP] weights saved!
[BaCP TRAINER] Mask generated from current model.
[TRAINER] Image size: None
[TRAINER] Initialized models
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.95_bacp_pruning.pt
[TRAINER] Weights loaded
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.001)
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] No scheduler initialized
[TRAINER] Finetuning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Current sparsity: 0.9500
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.95_bacp_finetune.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased

                                                                                                  

Training epoch [1/50]: Avg Loss: 3.3557 | Avg Accuracy: 47.64 | Model Sparsity: 0.95
Avg Perplexity: 25.275

[TRAINER] weights saved!


                                                                                                  

Training epoch [2/50]: Avg Loss: 2.8343 | Avg Accuracy: 48.37 | Model Sparsity: 0.95
Avg Perplexity: 24.158

[TRAINER] weights saved!


                                                                                                  

Training epoch [3/50]: Avg Loss: 2.7228 | Avg Accuracy: 48.09 | Model Sparsity: 0.95
Avg Perplexity: 23.238



                                                                                                  

Training epoch [4/50]: Avg Loss: 2.6384 | Avg Accuracy: 48.58 | Model Sparsity: 0.95
Avg Perplexity: 21.999

[TRAINER] weights saved!


                                                                                                  

Training epoch [5/50]: Avg Loss: 2.5753 | Avg Accuracy: 49.02 | Model Sparsity: 0.95
Avg Perplexity: 21.284

[TRAINER] weights saved!


                                                                                                  

Training epoch [6/50]: Avg Loss: 2.5316 | Avg Accuracy: 48.75 | Model Sparsity: 0.95
Avg Perplexity: 21.728



                                                                                                  

Training epoch [7/50]: Avg Loss: 2.4776 | Avg Accuracy: 49.15 | Model Sparsity: 0.95
Avg Perplexity: 21.219

[TRAINER] weights saved!


                                                                                                  

Training epoch [8/50]: Avg Loss: 2.4404 | Avg Accuracy: 49.20 | Model Sparsity: 0.95
Avg Perplexity: 21.093

[TRAINER] weights saved!


                                                                                                  

Training epoch [9/50]: Avg Loss: 2.4044 | Avg Accuracy: 48.86 | Model Sparsity: 0.95
Avg Perplexity: 21.864



                                                                                                   

Training epoch [10/50]: Avg Loss: 2.3655 | Avg Accuracy: 48.83 | Model Sparsity: 0.95
Avg Perplexity: 21.368



                                                                                                   

Training epoch [11/50]: Avg Loss: 2.3406 | Avg Accuracy: 49.36 | Model Sparsity: 0.95
Avg Perplexity: 21.393

[TRAINER] weights saved!


                                                                                                   

Training epoch [12/50]: Avg Loss: 2.3112 | Avg Accuracy: 49.27 | Model Sparsity: 0.95
Avg Perplexity: 21.112



                                                                                                   

Training epoch [13/50]: Avg Loss: 2.2785 | Avg Accuracy: 49.99 | Model Sparsity: 0.95
Avg Perplexity: 20.232

[TRAINER] weights saved!


                                                                                                   

Training epoch [14/50]: Avg Loss: 2.2555 | Avg Accuracy: 49.32 | Model Sparsity: 0.95
Avg Perplexity: 21.656



                                                                                                   

Training epoch [15/50]: Avg Loss: 2.2323 | Avg Accuracy: 49.53 | Model Sparsity: 0.95
Avg Perplexity: 20.801



                                                                                                   

Training epoch [16/50]: Avg Loss: 2.2100 | Avg Accuracy: 49.49 | Model Sparsity: 0.95
Avg Perplexity: 21.482



                                                                                                   

Training epoch [17/50]: Avg Loss: 2.1874 | Avg Accuracy: 49.12 | Model Sparsity: 0.95
Avg Perplexity: 22.721



                                                                                                   

Training epoch [18/50]: Avg Loss: 2.1664 | Avg Accuracy: 49.07 | Model Sparsity: 0.95
Avg Perplexity: 22.131



                                                                                                   

Training epoch [19/50]: Avg Loss: 2.1383 | Avg Accuracy: 49.16 | Model Sparsity: 0.95
Avg Perplexity: 22.012



                                                                                                   

Training epoch [20/50]: Avg Loss: 2.1337 | Avg Accuracy: 49.67 | Model Sparsity: 0.95
Avg Perplexity: 20.775



                                                                                                   

Training epoch [21/50]: Avg Loss: 2.1021 | Avg Accuracy: 48.74 | Model Sparsity: 0.95
Avg Perplexity: 22.685



                                                                                                   

Training epoch [22/50]: Avg Loss: 2.0951 | Avg Accuracy: 49.60 | Model Sparsity: 0.95
Avg Perplexity: 21.518



                                                                                                   

Training epoch [23/50]: Avg Loss: 2.0779 | Avg Accuracy: 49.70 | Model Sparsity: 0.95
Avg Perplexity: 21.820



                                                                                                   

Training epoch [24/50]: Avg Loss: 2.0626 | Avg Accuracy: 48.91 | Model Sparsity: 0.95
Avg Perplexity: 23.203



                                                                                                   

Training epoch [25/50]: Avg Loss: 2.0437 | Avg Accuracy: 49.49 | Model Sparsity: 0.95
Avg Perplexity: 21.999



                                                                                                   

Training epoch [26/50]: Avg Loss: 2.0292 | Avg Accuracy: 48.52 | Model Sparsity: 0.95
Avg Perplexity: 23.764



                                                                                                   

Training epoch [27/50]: Avg Loss: 2.0138 | Avg Accuracy: 49.62 | Model Sparsity: 0.95
Avg Perplexity: 22.143



                                                                                                   

Training epoch [28/50]: Avg Loss: 2.0021 | Avg Accuracy: 49.24 | Model Sparsity: 0.95
Avg Perplexity: 23.723



                                                                                                   

Training epoch [29/50]: Avg Loss: 1.9758 | Avg Accuracy: 49.37 | Model Sparsity: 0.95
Avg Perplexity: 23.509



                                                                                                   

Training epoch [30/50]: Avg Loss: 1.9709 | Avg Accuracy: 49.11 | Model Sparsity: 0.95
Avg Perplexity: 23.790



                                                                                                   

Training epoch [31/50]: Avg Loss: 1.9564 | Avg Accuracy: 49.43 | Model Sparsity: 0.95
Avg Perplexity: 24.129



                                                                                                   

Training epoch [32/50]: Avg Loss: 1.9489 | Avg Accuracy: 49.23 | Model Sparsity: 0.95
Avg Perplexity: 23.957



                                                                                                   

Training epoch [33/50]: Avg Loss: 1.9179 | Avg Accuracy: 49.75 | Model Sparsity: 0.95
Avg Perplexity: 23.117

[TRAINER] Training stopped. No improvements for 20 epochs.
[TRAINER] Loading weights: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.95_bacp_finetune.pt
[TRAINER] Weights loaded successfully
[TRAINER] Model Sparsity: 0.95


                                                         


TRAINING STATISTICS SUMMARY

Performance Metrics:
------------------------------
  Accuracy:     49.79%
  Perplexity:   20.780

Model Information:
------------------------------
  Total Parameters:     66,985,530
  Trainable Parameters: 66,985,530
  Model Sparsity:       0.9500 (95.00%)

Training Configuration:
------------------------------
  Model:                distilbert-base-uncased
  Task:                 wikitext2
  Learning Type:        bacp_finetune
  Batch Size:           64
  Learning Rate:        0.001
  Optimizer:            adamw
  Epochs:               50

System Information:
------------------------------
  Device:               cuda
  Mixed Precision:      True
  Workers:              24





In [0]:
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,   
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
    db=False,
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()
    
# Finetuning Phase
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
    db=False
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,   
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="magnitude_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
    db=False,
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()
    
# Finetuning Phase
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
    db=False
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

[TRAINER] Image size: None
[TRAINER] Weights loaded successfully
[TRAINER] Initialized BaCP models
[TRAINER] Optimizer type w/ learning rate: (adamw, 0.001)
[TRAINER] No scheduler initialized
[TRAINER] Data Initialized for model task: wikitext2
[TRAINER] Batch size: 64
[TRAINER] Number of dataloders: 3
[TRAINER] Pruning initialized
[TRAINER] Pruning type: magnitude_pruning
[TRAINER] Target sparsity: 0.99
[TRAINER] Sparsity scheduler: cubic
[TRAINER] Pruning epochs: 5
[TRAINER] Current sparsity: 0.0000
[TRAINER] Saving model to: ./research/distilbert-base-uncased/wikitext2/distilbert-base-uncased_wikitext2_magnitude_pruning_0.99_bacp_pruning.pt
[LOGGER] Log file created at location: ./log_records/distilbert-base-uncased/wikitext2/bacp_pruning/magnitude_pruning/0.99/run_1.log


Training Epoch [1/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.483.



                                                                                                                                        

Epoch [1/5]: Avg Total Loss: 7.6300 | Avg PrC Loss: 2.6036 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.4892 | Avg CE Loss: 2.5372 | Model Sparsity: 0.4831

[BaCP] weights saved!


                                                                                                                                           

Retraining Epoch [1/10]: Avg Total Loss: 6.7261 | Avg PrC Loss: 2.2543 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0442 | Avg CE Loss: 2.4276 | Model Sparsity: 0.4831



                                                                                                                                           

Retraining Epoch [2/10]: Avg Total Loss: 6.6043 | Avg PrC Loss: 2.2453 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0221 | Avg CE Loss: 2.3368 | Model Sparsity: 0.4831



                                                                                                                                           

Retraining Epoch [3/10]: Avg Total Loss: 6.5156 | Avg PrC Loss: 2.2511 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0215 | Avg CE Loss: 2.2430 | Model Sparsity: 0.4831



                                                                                                                                           

Retraining Epoch [4/10]: Avg Total Loss: 6.4194 | Avg PrC Loss: 2.2515 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0183 | Avg CE Loss: 2.1496 | Model Sparsity: 0.4831



                                                                                                                                           

Retraining Epoch [5/10]: Avg Total Loss: 6.3485 | Avg PrC Loss: 2.2651 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0301 | Avg CE Loss: 2.0533 | Model Sparsity: 0.4831



                                                                                                                                           

Retraining Epoch [6/10]: Avg Total Loss: 6.2753 | Avg PrC Loss: 2.2742 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0377 | Avg CE Loss: 1.9634 | Model Sparsity: 0.4831



                                                                                                                                           

Retraining Epoch [7/10]: Avg Total Loss: 6.2069 | Avg PrC Loss: 2.2795 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0411 | Avg CE Loss: 1.8863 | Model Sparsity: 0.4831



                                                                                                                                           

Retraining Epoch [8/10]: Avg Total Loss: 6.1610 | Avg PrC Loss: 2.2869 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0465 | Avg CE Loss: 1.8276 | Model Sparsity: 0.4831



                                                                                                                                           

Retraining Epoch [9/10]: Avg Total Loss: 6.1099 | Avg PrC Loss: 2.2844 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0440 | Avg CE Loss: 1.7814 | Model Sparsity: 0.4831



                                                                                                                                            

Retraining Epoch [10/10]: Avg Total Loss: 6.0632 | Avg PrC Loss: 2.2838 | Avg SnC Loss: 0.0000 | Avg FiC Loss: 2.0419 | Avg CE Loss: 1.7375 | Model Sparsity: 0.4831



Training Epoch [2/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.776.



                                                                                                                                           

Epoch [2/5]: Avg Total Loss: 10.1804 | Avg PrC Loss: 3.1152 | Avg SnC Loss: 2.0802 | Avg FiC Loss: 3.0693 | Avg CE Loss: 1.9157 | Model Sparsity: 0.7762

[BaCP] weights saved!


                                                                                                                                              

Retraining Epoch [1/10]: Avg Total Loss: 8.5710 | Avg PrC Loss: 2.6442 | Avg SnC Loss: 1.6421 | Avg FiC Loss: 2.5318 | Avg CE Loss: 1.7529 | Model Sparsity: 0.7762



                                                                                                                                              

Retraining Epoch [2/10]: Avg Total Loss: 8.4014 | Avg PrC Loss: 2.5889 | Avg SnC Loss: 1.6416 | Avg FiC Loss: 2.4596 | Avg CE Loss: 1.7113 | Model Sparsity: 0.7762



                                                                                                                                              

Retraining Epoch [3/10]: Avg Total Loss: 8.2973 | Avg PrC Loss: 2.5584 | Avg SnC Loss: 1.6383 | Avg FiC Loss: 2.4173 | Avg CE Loss: 1.6833 | Model Sparsity: 0.7762



                                                                                                                                              

Retraining Epoch [4/10]: Avg Total Loss: 8.2193 | Avg PrC Loss: 2.5376 | Avg SnC Loss: 1.6347 | Avg FiC Loss: 2.3884 | Avg CE Loss: 1.6586 | Model Sparsity: 0.7762



                                                                                                                                              

Retraining Epoch [5/10]: Avg Total Loss: 8.1614 | Avg PrC Loss: 2.5236 | Avg SnC Loss: 1.6314 | Avg FiC Loss: 2.3664 | Avg CE Loss: 1.6401 | Model Sparsity: 0.7762



                                                                                                                                              

Retraining Epoch [6/10]: Avg Total Loss: 8.1246 | Avg PrC Loss: 2.5159 | Avg SnC Loss: 1.6339 | Avg FiC Loss: 2.3561 | Avg CE Loss: 1.6187 | Model Sparsity: 0.7762



                                                                                                                                              

Retraining Epoch [7/10]: Avg Total Loss: 8.0728 | Avg PrC Loss: 2.5031 | Avg SnC Loss: 1.6302 | Avg FiC Loss: 2.3383 | Avg CE Loss: 1.6011 | Model Sparsity: 0.7762



                                                                                                                                              

Retraining Epoch [8/10]: Avg Total Loss: 8.0426 | Avg PrC Loss: 2.4969 | Avg SnC Loss: 1.6314 | Avg FiC Loss: 2.3302 | Avg CE Loss: 1.5842 | Model Sparsity: 0.7762



                                                                                                                                              

Retraining Epoch [9/10]: Avg Total Loss: 8.0034 | Avg PrC Loss: 2.4896 | Avg SnC Loss: 1.6267 | Avg FiC Loss: 2.3185 | Avg CE Loss: 1.5686 | Model Sparsity: 0.7762



                                                                                                                                               

Retraining Epoch [10/10]: Avg Total Loss: 7.9702 | Avg PrC Loss: 2.4819 | Avg SnC Loss: 1.6249 | Avg FiC Loss: 2.3099 | Avg CE Loss: 1.5535 | Model Sparsity: 0.7762



Training Epoch [3/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.927.



                                                                                                                                           

Epoch [3/5]: Avg Total Loss: 13.2674 | Avg PrC Loss: 3.3428 | Avg SnC Loss: 4.6134 | Avg FiC Loss: 3.3389 | Avg CE Loss: 1.9724 | Model Sparsity: 0.9266

[BaCP] weights saved!


                                                                                                                                              

Retraining Epoch [1/10]: Avg Total Loss: 11.0880 | Avg PrC Loss: 2.9558 | Avg SnC Loss: 3.4342 | Avg FiC Loss: 2.9097 | Avg CE Loss: 1.7883 | Model Sparsity: 0.9266



                                                                                                                                              

Retraining Epoch [2/10]: Avg Total Loss: 10.7464 | Avg PrC Loss: 2.8784 | Avg SnC Loss: 3.3134 | Avg FiC Loss: 2.8173 | Avg CE Loss: 1.7372 | Model Sparsity: 0.9266



                                                                                                                                              

Retraining Epoch [3/10]: Avg Total Loss: 10.5806 | Avg PrC Loss: 2.8383 | Avg SnC Loss: 3.2627 | Avg FiC Loss: 2.7685 | Avg CE Loss: 1.7111 | Model Sparsity: 0.9266



                                                                                                                                              

Retraining Epoch [4/10]: Avg Total Loss: 10.4775 | Avg PrC Loss: 2.8147 | Avg SnC Loss: 3.2402 | Avg FiC Loss: 2.7373 | Avg CE Loss: 1.6853 | Model Sparsity: 0.9266



                                                                                                                                              

Retraining Epoch [5/10]: Avg Total Loss: 10.3990 | Avg PrC Loss: 2.7950 | Avg SnC Loss: 3.2174 | Avg FiC Loss: 2.7142 | Avg CE Loss: 1.6724 | Model Sparsity: 0.9266



                                                                                                                                              

Retraining Epoch [6/10]: Avg Total Loss: 10.3342 | Avg PrC Loss: 2.7786 | Avg SnC Loss: 3.2025 | Avg FiC Loss: 2.6941 | Avg CE Loss: 1.6589 | Model Sparsity: 0.9266



                                                                                                                                              

Retraining Epoch [7/10]: Avg Total Loss: 10.2690 | Avg PrC Loss: 2.7631 | Avg SnC Loss: 3.1848 | Avg FiC Loss: 2.6764 | Avg CE Loss: 1.6446 | Model Sparsity: 0.9266



                                                                                                                                              

Retraining Epoch [8/10]: Avg Total Loss: 10.2589 | Avg PrC Loss: 2.7637 | Avg SnC Loss: 3.1908 | Avg FiC Loss: 2.6733 | Avg CE Loss: 1.6311 | Model Sparsity: 0.9266



                                                                                                                                              

Retraining Epoch [9/10]: Avg Total Loss: 10.2104 | Avg PrC Loss: 2.7523 | Avg SnC Loss: 3.1814 | Avg FiC Loss: 2.6592 | Avg CE Loss: 1.6175 | Model Sparsity: 0.9266



                                                                                                                                               

Retraining Epoch [10/10]: Avg Total Loss: 10.1689 | Avg PrC Loss: 2.7424 | Avg SnC Loss: 3.1697 | Avg FiC Loss: 2.6474 | Avg CE Loss: 1.6094 | Model Sparsity: 0.9266



Training Epoch [4/5]:   0%|          | 0/71 [00:00<?, ?it/s]


[Pruner] Cubic Sparsity ratio increased to 0.982.



                                                                                                                                           

Epoch [4/5]: Avg Total Loss: 19.2737 | Avg PrC Loss: 3.8603 | Avg SnC Loss: 9.3560 | Avg FiC Loss: 3.8744 | Avg CE Loss: 2.1830 | Model Sparsity: 0.9821

[BaCP] weights saved!


Retraining epoch [1/10]:  65%|██████▍   | 46/71 [00:35<00:19,  1.25it/s, Loss=1.91, PrC Loss=3.35, SnC Loss=6.32, FiC Loss=3.35, CE Loss=1.91]

### SNIP-it Prune

In [0]:
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,   
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="snip_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
    db=False,
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()
    
# Finetuning Phase
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
    db=False
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,   
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="snip_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
    db=False,
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()
    
# Finetuning Phase
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
    db=False
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,   
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="snip_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
    db=False,
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()
    
# Finetuning Phase
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
    db=False
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

### WandA Prune

In [0]:
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,   
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_LOW,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
    db=False,
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()
    
# Finetuning Phase
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
    db=False
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,   
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_MID,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
    db=False,
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()
    
# Finetuning Phase
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
    db=False
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)

In [0]:
trained_model_path = f"./research/{MODEL_NAME}/{MODEL_TASK}/{MODEL_NAME}_{MODEL_TASK}_baseline.pt"

bacp_training_args = BaCPTrainingArguments(
    model_name=MODEL_NAME,
    model_task=MODEL_TASK,
    batch_size=BATCH_SIZE_LLM,   
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type="wanda_pruning",
    target_sparsity=TARGET_SPARSITY_HIGH,
    sparsity_scheduler='cubic',
    finetuned_weights=trained_model_path,
    learning_type='bacp_pruning',
    db=False,
)
bacp_trainer = BaCPTrainer(bacp_training_args)
if TRAIN:
    bacp_trainer.train()
    
# Finetuning Phase
bacp_trainer.generate_mask_from_model()
training_args = TrainingArguments(
    model_name=bacp_trainer.model_name,
    model_task=bacp_trainer.model_task,
    batch_size=bacp_trainer.batch_size,
    optimizer_type_and_lr=('adamw', 1e-3),
    pruning_type=bacp_trainer.pruning_type,
    target_sparsity=bacp_trainer.target_sparsity,
    finetuned_weights=bacp_trainer.save_path,
    epochs=50,
    pruner=bacp_trainer.get_pruner(),
    finetune=True,
    learning_type="bacp_finetune",
    db=False
)
trainer = Trainer(training_args)
if TRAIN:
    trainer.train()

metrics = trainer.evaluate()
print_statistics(metrics, trainer)