In [3]:
from transformers import AutoModelForSeq2SeqLM

In [4]:
from TALib import TALib

In [25]:
model_t5 =  AutoModelForSeq2SeqLM.from_pretrained(TALib.CHECKPOINT)

In [26]:
model_t5

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [27]:
# module_encoder = model_t5.encoder 
# module_decoder = model_t5.decoder 

In [5]:
import torch
from torch.nn.utils import prune

In [29]:
parameters_to_prune = []
for _, module in model_t5.named_modules():
    if isinstance(module, torch.nn.Linear):
        parameters_to_prune.append((module, "weight"))
        


In [30]:
print(parameters_to_prune)

[(Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=2048, bias=False), 'weight'), (Linear(in_features=2048, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=2048, bias=False), 'weight'), (Linear(in_features=2048, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), '

In [31]:
        
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.7,
)

In [32]:
print(model_t5)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [33]:
# print(sum(torch.nn.utils.parameters_to_vector(model_t5.buffers()) == 0))
TALib.show_param_ratio(model=model_t5)

0.30019545555114746

In [34]:
pruned_model = model_t5

In [35]:
# TALib.save_model(pruned_model , "output/try_pruning_0.3")

In [36]:
# reload_model = TALib.load_model("output/try_pruning_0.3")

In [37]:
# TALib.show_param_ratio(model=reload_model)

## test

In [6]:
from datasets  import load_dataset

billsum = load_dataset("billsum", split="train")

In [7]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq , AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
tokenizer = AutoTokenizer.from_pretrained(TALib.TK_ckpt)

In [8]:
preprocess_function = TALib.preprocess_function_pass_tokenizer(tokenizer)

In [9]:
billsum = billsum.train_test_split(test_size=0.2)
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/15159 [00:00<?, ? examples/s]

Map:   0%|          | 0/3790 [00:00<?, ? examples/s]

## Data Handler

In [10]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=TALib.CHECKPOINT)

In [11]:
import evaluate

rouge = evaluate.load("rouge")

In [44]:
from rich import print

In [45]:
training_args = Seq2SeqTrainingArguments(
    output_dir="TA_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,  # Assuming you still want weight decay as it wasn't mentioned to remove
    save_total_limit=3,  # Assuming to maintain the save limit as before
    num_train_epochs=4,
    lr_scheduler_type="linear",
    seed=42,
    fp16=True,  # You mentioned "Native AMP" for mixed precision training which is generally enabled by setting fp16=True in Transformers
    logging_steps=10,  # Assuming to keep the logging frequency as before
    predict_with_generate=True,
)

compute_metrics = TALib.compute_metrics_pass_tokenizer(tokenizer)

trainer = Seq2SeqTrainer(
    model=pruned_model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [46]:
ratio = TALib.show_param_ratio(model=pruned_model)


In [47]:
print(ratio)

In [12]:
billsum_test = load_dataset("billsum", split="test")
tokenized_billsum_test = billsum_test.map(preprocess_function, batched=True)

In [49]:
trainer.evaluate(tokenized_billsum_test)



{'eval_loss': 9.338435173034668,
 'eval_rouge1': 0.0,
 'eval_rouge2': 0.0,
 'eval_rougeL': 0.0,
 'eval_rougeLsum': 0.0,
 'eval_gen_len': 19.0,
 'eval_runtime': 812.828,
 'eval_samples_per_second': 4.022,
 'eval_steps_per_second': 2.011}

In [50]:
results = trainer.predict(tokenized_billsum_test)

In [51]:
decoded_prediction = tokenizer.batch_decode(results[0], skip_special_tokens=True)


In [52]:
TALib.dump_to_kaggle_format(decoded_prediction , 'pruned_model_0.3.csv')

Unnamed: 0,ID,Predict
0,0,
1,1,-
2,2,-
3,3,-
4,4,-
...,...,...
3264,3264,
3265,3265,
3266,3266,-
3267,3267,-


In [53]:
final_score = TALib.run_score(predict=decoded_prediction,label=billsum_test)

In [54]:
print(final_score)

In [55]:
batch_size = 15

In [56]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./output/pruning",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,  # Assuming you still want weight decay as it wasn't mentioned to remove
    save_total_limit=3,  # Assuming to maintain the save limit as before
    num_train_epochs=1,
    lr_scheduler_type="linear",
    seed=42,
    fp16=True,  # You mentioned "Native AMP" for mixed precision training which is generally enabled by setting fp16=True in Transformers
    logging_steps=10,  # Assuming to keep the logging frequency as before
    predict_with_generate=True,

)

compute_metrics = TALib.compute_metrics_pass_tokenizer(tokenizer)

trainer = Seq2SeqTrainer(
    model=pruned_model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [57]:
ratio = TALib.show_param_ratio(model=pruned_model)

In [58]:
print(ratio)

In [59]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,7.1449,6.894363,0.0001,0.0,0.0001,0.0001,19.0




TrainOutput(global_step=1011, training_loss=7.510942801995282, metrics={'train_runtime': 21188.2798, 'train_samples_per_second': 0.715, 'train_steps_per_second': 0.048, 'total_flos': 4103292737028096.0, 'train_loss': 7.510942801995282, 'epoch': 1.0})

In [61]:
TALib.save_model(pruned_model , "output/try_pruning_0.3")

In [2]:
from TALib import TALib

In [13]:
reload_model = TALib.load_model("output/try_pruning_0.3")

Some weights of the model checkpoint at output/try_pruning_0.3 were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.0.SelfAttention.k.weight_mask', 'decoder.block.0.layer.0.SelfAttention.k.weight_orig', 'decoder.block.0.layer.0.SelfAttention.o.weight_mask', 'decoder.block.0.layer.0.SelfAttention.o.weight_orig', 'decoder.block.0.layer.0.SelfAttention.q.weight_mask', 'decoder.block.0.layer.0.SelfAttention.q.weight_orig', 'decoder.block.0.layer.0.SelfAttention.v.weight_mask', 'decoder.block.0.layer.0.SelfAttention.v.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.k.weight_mask', 'decoder.block.0.layer.1.EncDecAttention.k.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.o.weight_mask', 'decoder.block.0.layer.1.EncDecAttention.o.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.q.weight_mask', 'decoder.block.0.layer.1.EncDecAttention.q.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.v.weight_mask', 'decoder.block.0.layer.1.EncDecAttent

In [14]:
TALib.show_param_ratio(reload_model)

0.30019545555114746

In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir="TA_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,  # Assuming you still want weight decay as it wasn't mentioned to remove
    save_total_limit=3,  # Assuming to maintain the save limit as before
    num_train_epochs=4,
    lr_scheduler_type="linear",
    seed=42,
    fp16=True,  # You mentioned "Native AMP" for mixed precision training which is generally enabled by setting fp16=True in Transformers
    logging_steps=10,  # Assuming to keep the logging frequency as before
    predict_with_generate=True,
)

compute_metrics = TALib.compute_metrics_pass_tokenizer(tokenizer)

trainer = Seq2SeqTrainer(
    model=reload_model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
trainer.evaluate(tokenized_billsum_test)



{'eval_loss': 6.906755447387695,
 'eval_rouge1': 0.0001,
 'eval_rouge2': 0.0,
 'eval_rougeL': 0.0001,
 'eval_rougeLsum': 0.0001,
 'eval_gen_len': 19.0,
 'eval_runtime': 854.5725,
 'eval_samples_per_second': 3.825,
 'eval_steps_per_second': 1.913}

In [17]:
results = trainer.predict(tokenized_billsum_test)

In [18]:
decoded_prediction = tokenizer.batch_decode(results[0], skip_special_tokens=True)

In [19]:
TALib.dump_to_kaggle_format(decoded_prediction , 'pruned_model_0.3_fine_tune.csv')

Unnamed: 0,ID,Predict
0,0,-
1,1,-
2,2,-
3,3,-
4,4,-
...,...,...
3264,3264,
3265,3265,-
3266,3266,-
3267,3267,-


In [21]:
TALib.run_score(decoded_prediction , billsum_test)

0.005192334416469477