In [1]:
from transformers import AutoModelForSeq2SeqLM

In [2]:
from TALib import TALib

In [3]:
# Load the seq2seq model from the checkpoint identified by TALib.CHECKPOINT.
model_t5 =  AutoModelForSeq2SeqLM.from_pretrained(TALib.CHECKPOINT)

In [4]:
# Display the module tree of the loaded model (the cell's last expression is rendered).
model_t5

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [5]:
# Keep direct handles on the encoder and decoder sub-stacks of the model.
# NOTE(review): neither name is referenced again in the visible notebook.
module_encoder = model_t5.encoder 
module_decoder = model_t5.decoder 

In [6]:
import torch
from torch.nn.utils import prune

In [7]:
# Pair every torch.nn.Linear submodule of the model with the name of its
# "weight" tensor; this is the (module, name) list handed to the pruning call.
parameters_to_prune = [
    (submodule, "weight")
    for _name, submodule in model_t5.named_modules()
    if isinstance(submodule, torch.nn.Linear)
]
        


In [8]:
# Dump the collected (module, "weight") pairs for inspection.
print(parameters_to_prune)

[(Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=2048, bias=False), 'weight'), (Linear(in_features=2048, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=2048, bias=False), 'weight'), (Linear(in_features=2048, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), 'weight'), (Linear(in_features=512, out_features=512, bias=False), '

In [9]:
        
# Prune all collected Linear weights jointly ("global") using the L1Unstructured
# method, i.e. the pruning budget is shared across layers instead of being
# applied to each layer separately.
# amount=0.7 requests pruning 70% of the pooled weights; the ratio reported
# further down (~0.30) is consistent with that.
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.7,
)

In [10]:
# Print the module tree again after pruning; the recorded (truncated) output
# shows the same layer structure as before the pruning call.
print(model_t5)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [13]:
# Earlier manual check (disabled): count zero entries across the model's buffers.
# print(sum(torch.nn.utils.parameters_to_vector(model_t5.buffers()) == 0))
# Report the parameter ratio of the pruned model via the TALib helper.
# NOTE(review): the recorded value (~0.30) presumably is the fraction of
# weights left non-zero after pruning 70% — confirm against TALib.
TALib.show_param_ratio(model=model_t5)

0.30019545555114746

In [14]:
# Alias, not a copy: `pruned_model` and `model_t5` refer to the same object.
pruned_model = model_t5

## Test the pruned model on billsum

In [16]:
from datasets  import load_dataset

# Load only the "train" split of the billsum dataset.
billsum = load_dataset("billsum", split="train")

In [17]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq , AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the tokenizer from the checkpoint identified by TALib.TK_ckpt.
tokenizer = AutoTokenizer.from_pretrained(TALib.TK_ckpt)

In [18]:
# Build the preprocessing callable from the tokenizer; it is applied to the
# datasets with `.map(..., batched=True)` in later cells.
# NOTE(review): presumably tokenizes inputs and targets — confirm in TALib.
preprocess_function = TALib.preprocess_function_pass_tokenizer(tokenizer)

In [19]:
# Hold out 20% of the loaded training data as a validation ("test") split.
# A fixed seed makes the split reproducible across kernel restarts; 42 matches
# the seed passed to the training arguments.
# NOTE: this rebinds `billsum` from a single split to the split result, so the
# cell should not be re-run without reloading the dataset first.
billsum = billsum.train_test_split(test_size=0.2, seed=42)

# Apply the preprocessing callable to both resulting splits, batch-wise.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/15159 [00:00<?, ? examples/s]

Map:   0%|          | 0/3790 [00:00<?, ? examples/s]

## Data collator and ROUGE metric

In [20]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for seq2seq training (pads inputs and labels per batch).
# `model` has to be the model object rather than a checkpoint name: the
# collator only uses it to derive decoder_input_ids from the labels, and a
# plain string is silently ignored.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=pruned_model)

In [21]:
import evaluate

# Load the ROUGE metric through the `evaluate` library.
# NOTE(review): `rouge` is not referenced again in the visible notebook; the
# metric callback is built by TALib instead.
rouge = evaluate.load("rouge")

In [22]:
from rich import print

In [23]:
# Configuration for the first Seq2SeqTrainer (used below for evaluation and
# prediction with the pruned model).
training_args = Seq2SeqTrainingArguments(
    output_dir="TA_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,  # cap on the number of checkpoints kept on disk
    num_train_epochs=4,
    lr_scheduler_type="linear",
    seed=42,
    fp16=True,  # mixed-precision training
    logging_steps=10,
    predict_with_generate=True,  # evaluate/predict with generate() so text metrics can be computed
)

# Metric callback built by TALib around the tokenizer.
compute_metrics = TALib.compute_metrics_pass_tokenizer(tokenizer)

# The 20% hold-out carved from the train split serves as the evaluation set.
trainer = Seq2SeqTrainer(
    model=pruned_model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# The trailing `dataloader_config = DataLoaderConfiguration(...)` statement was
# dropped: the class is never imported in this notebook (NameError on a fresh
# kernel) and the object was never handed to the trainer or used elsewhere.


In [25]:
# Capture the value returned by the TALib ratio helper for the pruned model.
ratio = TALib.show_param_ratio(model=pruned_model)


In [26]:
# `print` here is rich's print (imported a few cells earlier), not the builtin.
print(ratio)

In [27]:
# Load the dataset's own "test" split (separate from the 20% hold-out carved
# out of the train split) and run it through the same preprocessing callable.
billsum_test = load_dataset("billsum", split="test")
tokenized_billsum_test = billsum_test.map(preprocess_function, batched=True)

In [28]:
# Evaluate the pruned model on the tokenized test split; no training has been
# run in the cells shown above.
# The recorded metrics show every ROUGE score at 0.0 with eval_loss ~9.34.
trainer.evaluate(tokenized_billsum_test)



{'eval_loss': 9.338435173034668,
 'eval_rouge1': 0.0,
 'eval_rouge2': 0.0,
 'eval_rougeL': 0.0,
 'eval_rougeLsum': 0.0,
 'eval_gen_len': 19.0,
 'eval_runtime': 850.5612,
 'eval_samples_per_second': 3.843,
 'eval_steps_per_second': 1.922}

In [29]:
# Run prediction over the tokenized test split; the first element of the
# result is decoded into text in the next cell.
results = trainer.predict(tokenized_billsum_test)



In [30]:
# Turn the predicted token ids back into text, dropping special tokens.
decoded_prediction = tokenizer.batch_decode(
    results.predictions,
    skip_special_tokens=True,
)


In [35]:
import csv
def dump_to_kaggle_format(decoded_prediction: list[str], filename: str):
    """Write decoded predictions to ``filename`` as a fully quoted UTF-8 CSV.

    The predictions are first arranged into a DataFrame by
    ``TALib.to_ta_kaggle_format``; only its "Predict" column is modified here.
    Newlines inside each prediction are replaced by single spaces so that
    every record stays on one physical line.

    Double quotes are deliberately *not* escaped by hand: ``to_csv`` with
    ``csv.QUOTE_ALL`` already doubles embedded quotes, so escaping them here
    as well would write each quote four times and corrupt the text when the
    file is read back by a CSV parser.

    Args:
        decoded_prediction: Predicted texts, one string per example.
        filename: Destination path of the CSV file.

    Returns:
        The DataFrame that was written (with newlines already flattened).
    """
    df_results = TALib.to_ta_kaggle_format(decoded_prediction)

    def flatten_newlines(text):
        # Keep each prediction on a single line of the CSV file.
        return text.replace("\n", " ")

    # Normalize the "Predict" column before writing.
    df_results["Predict"] = df_results["Predict"].apply(flatten_newlines)

    df_results.to_csv(
        filename,
        index=False,
        quoting=csv.QUOTE_ALL,  # quote every field; embedded quotes are doubled by the writer
        encoding="utf-8",
    )

    return df_results

In [36]:
# Write the decoded predictions to 'pruned_model_0.3.csv'.
# NOTE(review): this calls TALib.dump_to_kaggle_format, not the notebook-local
# dump_to_kaggle_format defined in the previous cell — confirm which is intended.
TALib.dump_to_kaggle_format(decoded_prediction , 'pruned_model_0.3.csv')

Unnamed: 0,ID,Predict
0,0,
1,1,-
2,2,-
3,3,-
4,4,-
...,...,...
3264,3264,
3265,3265,
3266,3266,-
3267,3267,-


In [32]:
# Score the decoded predictions against the (untokenized) test split using the
# TALib scoring helper.
final_score = TALib.run_score(predict=decoded_prediction,label=billsum_test)

In [33]:
# Show the score computed in the previous cell (rich's print is in effect).
print(final_score)

In [43]:
# Per-device batch size used for both training and evaluation in the next cell.
batch_size = 15

In [44]:
# Configuration for the fine-tuning run of the pruned model: one epoch, with
# `batch_size` examples per device for both training and evaluation.
training_args = Seq2SeqTrainingArguments(
    output_dir="./output/pruning",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,  # cap on the number of checkpoints kept on disk
    num_train_epochs=1,
    lr_scheduler_type="linear",
    seed=42,
    fp16=True,  # mixed-precision training
    logging_steps=10,
    predict_with_generate=True,  # evaluate/predict with generate() so text metrics can be computed
)

# Metric callback built by TALib around the tokenizer.
compute_metrics = TALib.compute_metrics_pass_tokenizer(tokenizer)

# Rebinds `trainer`: from here on it refers to the fine-tuning configuration.
trainer = Seq2SeqTrainer(
    model=pruned_model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# The trailing `dataloader_config = DataLoaderConfiguration(...)` statement was
# dropped: the class is never imported in this notebook (NameError on a fresh
# kernel) and the object was never handed to the trainer or used elsewhere.


In [45]:
# Re-query the TALib ratio helper for the pruned model before fine-tuning.
ratio = TALib.show_param_ratio(model=pruned_model)

In [46]:
# Show the ratio captured in the previous cell (rich's print is in effect).
print(ratio)

In [47]:
# Fine-tune the pruned model with the trainer configured above.
# NOTE(review): the recorded output ends in KeyboardInterrupt, so no completed
# training result is captured in this notebook.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 