In [1]:
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer , T5ForConditionalGeneration,Seq2SeqTrainer,Seq2SeqTrainingArguments
from transformers.models.t5.modeling_t5 import T5LayerSelfAttention , T5Attention

In [2]:
from TALib import TALib

In [3]:
# Build the project helper object. The two "Map" progress bars shown under this
# cell (15159 and 3790 examples) are printed while it is being constructed.
ta_lib = TALib()

Map:   0%|          | 0/15159 [00:00<?, ? examples/s]

Map:   0%|          | 0/3790 [00:00<?, ? examples/s]

In [4]:
# Load the seq2seq checkpoint named by TALib.CHECKPOINT. No tokenizer is created
# here (the AutoTokenizer.from_pretrained(TALib.TK_ckpt) call stays disabled);
# later cells decode with ta_lib.tokenizer instead.
model = AutoModelForSeq2SeqLM.from_pretrained(TALib.CHECKPOINT)

In [5]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [6]:
# Baseline before any pruning: the helper reports 1.0 here and smaller values
# after each pruning stage (presumably the fraction of non-zero weights —
# TODO confirm against TALib.show_param_ratio).
TALib.show_param_ratio(model)

1.0

In [7]:
# Trainer for the dense (un-pruned) fine-tuning run: 15 epochs, batch size 5.
# NOTE(review): output_dir is presumably where checkpoints are written — confirm in TALib.get_trainer.
trainer = ta_lib.get_trainer(model=model , num_train_epochs=15 , batch_size=5 , output_dir="KY_billsum_model") 

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [8]:
# Reference scores of the freshly loaded checkpoint on the tokenized test split,
# taken before any fine-tuning or pruning so later stages can be compared to it.
trainer.evaluate(ta_lib.tokenized_billsum_test)



{'eval_loss': 1.49580979347229,
 'eval_rouge1': 0.241,
 'eval_rouge2': 0.1962,
 'eval_rougeL': 0.2333,
 'eval_rougeLsum': 0.2334,
 'eval_gen_len': 18.9997,
 'eval_runtime': 371.6962,
 'eval_samples_per_second': 8.795,
 'eval_steps_per_second': 1.76}

In [9]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.717,1.461313,0.2435,0.1977,0.2362,0.2361,19.0
2,1.6281,1.441524,0.2438,0.1984,0.2364,0.2363,18.9995
3,1.4395,1.427871,0.2442,0.1994,0.237,0.2369,18.9995
4,1.5677,1.410927,0.2446,0.2001,0.2375,0.2374,18.9995
5,1.6117,1.403012,0.2448,0.2005,0.2376,0.2375,19.0
6,1.6658,1.393306,0.245,0.201,0.238,0.2379,19.0
7,1.3804,1.387831,0.2454,0.2013,0.2384,0.2383,19.0
8,1.4932,1.381272,0.2453,0.2012,0.2382,0.2382,19.0
9,1.3484,1.376467,0.2452,0.2014,0.2382,0.2381,18.9995
10,1.4622,1.372788,0.2455,0.2019,0.2387,0.2386,19.0




TrainOutput(global_step=45480, training_loss=1.5199361290340374, metrics={'train_runtime': 18094.4938, 'train_samples_per_second': 12.567, 'train_steps_per_second': 2.513, 'total_flos': 6.154939105542144e+16, 'train_loss': 1.5199361290340374, 'epoch': 15.0})

In [10]:
trainer.evaluate(ta_lib.tokenized_billsum_test)

{'eval_loss': 1.3625977039337158,
 'eval_rouge1': 0.2448,
 'eval_rouge2': 0.2024,
 'eval_rougeL': 0.2379,
 'eval_rougeLsum': 0.2379,
 'eval_gen_len': 19.0,
 'eval_runtime': 380.2154,
 'eval_samples_per_second': 8.598,
 'eval_steps_per_second': 1.72,
 'epoch': 15.0}

In [11]:
import torch

import torch.nn.utils.prune as prune

In [12]:
from transformers.models.t5.modeling_t5 import T5LayerSelfAttention, T5LayerCrossAttention, T5LayerFF

In [13]:
# Pruning targets as (module, "weight") pairs, bucketed by the kind of T5
# sub-layer the Linear module lives in.
parameters_to_prune = {"self_attention" : [] , "cross_attention":[] , "ffn":[] , "lm_head":[]}

# Bucket name -> T5 sub-layer class whose nested Linear modules go in that bucket.
layer_buckets = (
    ("self_attention", T5LayerSelfAttention),
    ("cross_attention", T5LayerCrossAttention),
    ("ffn", T5LayerFF),
)

for name, module in model.named_modules():
    for bucket, layer_type in layer_buckets:
        if not isinstance(module, layer_type):
            continue
        # Every Linear found anywhere below this sub-layer is a pruning target.
        parameters_to_prune[bucket].extend(
            (child, "weight")
            for child in module.modules()
            if isinstance(child, torch.nn.Linear)
        )

    # The output projection is matched by its registered name on the model.
    if name == "lm_head" and isinstance(module, torch.nn.Linear):
        parameters_to_prune["lm_head"].append((module, "weight"))


In [14]:
parameters_to_prune

{'self_attention': [(Linear(in_features=512, out_features=512, bias=False),
   'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_features=512, out_features=512, bias=False), 'weight'),
  (Linear(in_

In [15]:
# Per-group `amount` values passed to prune.global_unstructured in later cells.
# Given as floats, each is the fraction of that group's weights to prune.
self_attention_prune_amount = 0.9  # group: parameters_to_prune["self_attention"]
cross_attention_prune_amount = 0.4  # group: parameters_to_prune["cross_attention"]
ff_prune_amount = 0.7  # group: parameters_to_prune["ffn"]
lm_head_amount = 0.8  # group: parameters_to_prune["lm_head"]

In [16]:
trainer = ta_lib.get_trainer(model=model )

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
def prune_model(groups=None, amounts=None):
    """Apply global L1-magnitude pruning to each group of Linear weights.

    Each group is pruned on its own: ``prune.global_unstructured`` ranks the
    weights of every tensor in the group together and masks the smallest
    ``amount`` fraction of them.

    Args:
        groups: Mapping of group name -> list of ``(module, "weight")`` pairs.
            Defaults to the notebook-level ``parameters_to_prune``.
        amounts: Mapping of group name -> pruning amount. Defaults to the
            notebook-level amounts for ``self_attention``, ``cross_attention``,
            ``ffn`` and ``lm_head`` (in that order). Pass a subset to prune
            only selected groups.

    Returns:
        None. The modules are modified in place.

    Note:
        Calling this again on already-pruned modules applies a further round
        of pruning on top of the existing masks.
    """
    if groups is None:
        groups = parameters_to_prune
    if amounts is None:
        # Looked up at call time so edits to the notebook constants are honoured.
        amounts = {
            "self_attention": self_attention_prune_amount,
            "cross_attention": cross_attention_prune_amount,
            "ffn": ff_prune_amount,
            "lm_head": lm_head_amount,
        }

    for group_name, amount in amounts.items():
        targets = groups.get(group_name, [])
        if not targets:
            # Nothing collected for this group (e.g. a model without
            # cross-attention layers); global pruning has nothing to rank.
            continue
        prune.global_unstructured(
            targets,
            pruning_method=prune.L1Unstructured,
            amount=amount,
        )
    return

In [13]:
# prune_model()  # left disabled: the cells below prune one group at a time and fine-tune after each stage

In [17]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [18]:
TALib.show_param_ratio(model)

1.0

In [19]:
# Stage 1: L1-magnitude pruning of the self-attention Linear weights.
# The threshold is global across every tensor in the group, not per layer.
prune.global_unstructured(
    parameters_to_prune["self_attention"],
    pruning_method=prune.L1Unstructured,
    amount=self_attention_prune_amount,
)

In [20]:
TALib.show_param_ratio(model)

0.8128366470336914

In [21]:
trainer = ta_lib.get_trainer(model)

In [22]:
trainer.evaluate(ta_lib.tokenized_billsum_test)

{'eval_loss': 3.7154033184051514,
 'eval_rouge1': 0.039,
 'eval_rouge2': 0.0055,
 'eval_rougeL': 0.0343,
 'eval_rougeLsum': 0.0343,
 'eval_gen_len': 18.9376,
 'eval_runtime': 777.4345,
 'eval_samples_per_second': 4.205,
 'eval_steps_per_second': 2.103}

In [23]:
trainer = ta_lib.get_trainer(model , num_train_epochs=15 , batch_size=5 ,output_dir="./output/pruning_v3")

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.5526,2.21481,0.2274,0.1745,0.2164,0.2164,18.9704
2,2.3259,2.081054,0.233,0.1833,0.224,0.2239,18.9884
3,2.0882,2.019241,0.235,0.1864,0.2266,0.2266,18.9918
4,2.1731,1.966064,0.2365,0.1887,0.2283,0.2285,18.9879
5,2.2011,1.934946,0.2378,0.1902,0.2297,0.2297,18.9926
6,2.2409,1.904227,0.2389,0.1915,0.2308,0.2309,18.9966
7,2.0091,1.887683,0.2396,0.1925,0.2316,0.2316,18.996
8,2.0624,1.872282,0.2395,0.1927,0.2317,0.2317,18.996
9,1.9053,1.858434,0.2401,0.1935,0.2324,0.2324,18.9979
10,2.0081,1.851333,0.2402,0.1938,0.2325,0.2326,18.9982




TrainOutput(global_step=45480, training_loss=2.1489335898044555, metrics={'train_runtime': 18728.5176, 'train_samples_per_second': 12.141, 'train_steps_per_second': 2.428, 'total_flos': 6.154939105542144e+16, 'train_loss': 2.1489335898044555, 'epoch': 15.0})

In [25]:
trainer.evaluate(ta_lib.tokenized_billsum_test)

{'eval_loss': 1.8115092515945435,
 'eval_rouge1': 0.2391,
 'eval_rouge2': 0.1938,
 'eval_rougeL': 0.2311,
 'eval_rougeLsum': 0.2311,
 'eval_gen_len': 18.9969,
 'eval_runtime': 372.1457,
 'eval_samples_per_second': 8.784,
 'eval_steps_per_second': 1.757,
 'epoch': 15.0}

In [26]:
TALib.show_param_ratio(model)

0.8128366470336914

In [27]:
# Stage 2: L1-magnitude pruning of the cross-attention Linear weights,
# applied after the self-attention-pruned model has been fine-tuned.
prune.global_unstructured(
    parameters_to_prune["cross_attention"],
    pruning_method=prune.L1Unstructured,
    amount=cross_attention_prune_amount,
)

In [28]:
TALib.show_param_ratio(model)

0.7712448239326477

In [29]:
trainer = ta_lib.get_trainer(model , num_train_epochs=20 , batch_size=5 ,output_dir="./output/pruning_v3")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [30]:
trainer.evaluate(ta_lib.tokenized_billsum_test)

{'eval_loss': 6.325265407562256,
 'eval_rouge1': 0.0139,
 'eval_rouge2': 0.0001,
 'eval_rougeL': 0.0137,
 'eval_rougeLsum': 0.0137,
 'eval_gen_len': 18.4628,
 'eval_runtime': 386.4415,
 'eval_samples_per_second': 8.459,
 'eval_steps_per_second': 1.692}

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.9372,2.497152,0.215,0.1228,0.1871,0.1872,18.9061
2,2.6268,2.276995,0.2237,0.1465,0.201,0.201,18.9541
3,2.3595,2.180384,0.2299,0.1764,0.2187,0.2187,18.9765
4,2.4244,2.134937,0.2318,0.181,0.2223,0.2223,18.9828
5,2.4406,2.099964,0.2335,0.1834,0.2241,0.2241,18.9916
6,2.4634,2.067773,0.2347,0.185,0.2255,0.2255,18.9926
7,2.2492,2.049789,0.2352,0.1861,0.2263,0.2263,18.9958
8,2.2727,2.036448,0.236,0.1868,0.2269,0.227,18.9966
9,2.1398,2.015613,0.2362,0.1874,0.2274,0.2275,18.9971
10,2.2012,2.005004,0.2367,0.1881,0.228,0.2281,18.9947




TrainOutput(global_step=60640, training_loss=2.3469779018990913, metrics={'train_runtime': 24651.7094, 'train_samples_per_second': 12.299, 'train_steps_per_second': 2.46, 'total_flos': 8.206585474056192e+16, 'train_loss': 2.3469779018990913, 'epoch': 20.0})

In [32]:
trainer.evaluate(ta_lib.tokenized_billsum_test)

{'eval_loss': 1.9339864253997803,
 'eval_rouge1': 0.238,
 'eval_rouge2': 0.1907,
 'eval_rougeL': 0.2293,
 'eval_rougeLsum': 0.2294,
 'eval_gen_len': 18.9939,
 'eval_runtime': 380.9624,
 'eval_samples_per_second': 8.581,
 'eval_steps_per_second': 1.717,
 'epoch': 20.0}

In [33]:
# Stage 3: L1-magnitude pruning of the feed-forward (T5LayerFF) Linear weights,
# with one threshold shared across the whole group.
prune.global_unstructured(
    parameters_to_prune["ffn"],
    pruning_method=prune.L1Unstructured,
    amount=ff_prune_amount,
)

In [34]:
TALib.show_param_ratio(model)

0.48010188341140747

In [35]:
trainer = ta_lib.get_trainer(model , num_train_epochs=15 , batch_size=5 ,output_dir="./output/pruning_v3")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [36]:
trainer.evaluate(ta_lib.tokenized_billsum_test)

{'eval_loss': 6.28693962097168,
 'eval_rouge1': 0.0146,
 'eval_rouge2': 0.0001,
 'eval_rougeL': 0.0146,
 'eval_rougeLsum': 0.0146,
 'eval_gen_len': 19.0,
 'eval_runtime': 392.3616,
 'eval_samples_per_second': 8.332,
 'eval_steps_per_second': 1.667}

In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.0868,2.531628,0.221,0.1479,0.2004,0.2004,18.9485
2,2.8432,2.43392,0.2278,0.1717,0.216,0.216,18.9908
3,2.5881,2.386825,0.228,0.1771,0.2186,0.2186,18.99
4,2.6892,2.349193,0.2297,0.1794,0.2205,0.2205,18.9934
5,2.7324,2.32447,0.2308,0.1809,0.2219,0.2218,18.9958
6,2.734,2.302664,0.2313,0.1815,0.2225,0.2225,18.9871
7,2.5729,2.287221,0.2323,0.1825,0.2235,0.2235,18.9876
8,2.5834,2.277168,0.2325,0.1826,0.2237,0.2237,18.9897
9,2.4103,2.261664,0.2328,0.1831,0.2242,0.2241,18.9923
10,2.5184,2.256471,0.2334,0.1837,0.2247,0.2247,18.9982




TrainOutput(global_step=45480, training_loss=2.6936071073369385, metrics={'train_runtime': 19609.6108, 'train_samples_per_second': 11.596, 'train_steps_per_second': 2.319, 'total_flos': 6.154939105542144e+16, 'train_loss': 2.6936071073369385, 'epoch': 15.0})

In [38]:
trainer.evaluate(ta_lib.tokenized_billsum_test)

{'eval_loss': 2.212543249130249,
 'eval_rouge1': 0.2343,
 'eval_rouge2': 0.1853,
 'eval_rougeL': 0.2254,
 'eval_rougeLsum': 0.2253,
 'eval_gen_len': 18.9951,
 'eval_runtime': 400.8947,
 'eval_samples_per_second': 8.154,
 'eval_steps_per_second': 1.631,
 'epoch': 15.0}

In [39]:
# Stage 4: L1-magnitude pruning of the lm_head projection. This group only
# collects the module named "lm_head", so the ranking covers that layer alone.
prune.global_unstructured(
    parameters_to_prune["lm_head"],
    pruning_method=prune.L1Unstructured,
    amount=lm_head_amount,
)

In [40]:
TALib.show_param_ratio(model)

0.26261115074157715

In [41]:
trainer = ta_lib.get_trainer(model , num_train_epochs=30 , batch_size=5 ,output_dir="./output/pruning_v3")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [42]:
trainer.evaluate(ta_lib.tokenized_billsum_test)

{'eval_loss': 6.514431476593018,
 'eval_rouge1': 0.051,
 'eval_rouge2': 0.0224,
 'eval_rougeL': 0.0493,
 'eval_rougeLsum': 0.0493,
 'eval_gen_len': 18.6164,
 'eval_runtime': 388.3105,
 'eval_samples_per_second': 8.419,
 'eval_steps_per_second': 1.684}

In [43]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,5.0021,4.273895,0.1226,0.0685,0.1149,0.1149,18.9649
2,4.2912,3.746312,0.1964,0.1311,0.1848,0.1848,18.9929
3,3.8587,3.49881,0.2086,0.1499,0.1985,0.1985,18.9591
4,3.9128,3.349042,0.2111,0.1551,0.2016,0.2016,18.9354
5,3.9082,3.247055,0.2117,0.158,0.2031,0.2032,18.9507
6,3.8556,3.164701,0.2144,0.1623,0.2062,0.2062,18.9586
7,3.5887,3.108447,0.2177,0.1656,0.2092,0.2092,18.9673
8,3.5781,3.059389,0.221,0.1689,0.2123,0.2124,18.9715
9,3.387,3.010522,0.2235,0.1715,0.2147,0.2148,18.9757
10,3.4961,2.977571,0.2255,0.1733,0.2166,0.2167,18.9739




TrainOutput(global_step=90960, training_loss=3.5635117952091817, metrics={'train_runtime': 40046.5942, 'train_samples_per_second': 11.356, 'train_steps_per_second': 2.271, 'total_flos': 1.2309878211084288e+17, 'train_loss': 3.5635117952091817, 'epoch': 30.0})

In [44]:
trainer.evaluate(ta_lib.tokenized_billsum_test)

{'eval_loss': 2.7338263988494873,
 'eval_rouge1': 0.2343,
 'eval_rouge2': 0.1838,
 'eval_rougeL': 0.2253,
 'eval_rougeLsum': 0.2253,
 'eval_gen_len': 18.9838,
 'eval_runtime': 389.4141,
 'eval_samples_per_second': 8.395,
 'eval_steps_per_second': 1.679,
 'epoch': 30.0}

In [45]:
TALib.show_param_ratio(model)

0.26261115074157715

In [46]:
results = trainer.predict(ta_lib.tokenized_billsum_test)

In [47]:
# Turn the predicted token ids back into text, dropping special tokens.
# NOTE(review): results[0] is assumed to be the predictions field of the
# object returned by trainer.predict — confirm against TALib.get_trainer.
decoded_prediction = ta_lib.tokenizer.batch_decode(results[0], skip_special_tokens=True)

In [48]:
TALib.dump_to_kaggle_format(decoded_prediction , 'pruned_model_0.26_day_5_6.csv')

Unnamed: 0,ID,Predict
0,0,Amends the Water Resources Development Act of ...
1,1,Federal Forage Fee Act of 1993 - Amends the Fe...
2,2,Merchant Marine of World War II Congressional ...
3,3,Small Business Tax Modernization Act of 2004 -...
4,4,Fair Access to Investment Research Act of 2016...
...,...,...
3264,3264,Public Servant Priority Placement Act of 1995 ...
3265,3265,Sportsmanship of 2008 - Amends the Federal cri...
3266,3266,Helping College Student Cross the Finish Line ...
3267,3267,Texas National Forests Improvement Act of 2000...


In [49]:
TALib.run_score(decoded_prediction , ta_lib.billsum_test)

0.16520012803386583

In [50]:
# NOTE(review): the model still carries the pruning reparametrization here; the
# reload in the next cell reports the *.weight_orig / *.weight_mask checkpoint
# keys as unused. Consider making the pruning permanent (prune.remove) before
# saving — first verify how TALib.save_model treats the masks and whether
# lm_head shares its weight with the embedding.
TALib.save_model(model , "output/pruning_success_v2")

In [51]:
reload_model = TALib.load_model("output/pruning_success_v2")

Some weights of the model checkpoint at output/pruning_success_v2 were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.0.SelfAttention.k.weight_mask', 'decoder.block.0.layer.0.SelfAttention.k.weight_orig', 'decoder.block.0.layer.0.SelfAttention.o.weight_mask', 'decoder.block.0.layer.0.SelfAttention.o.weight_orig', 'decoder.block.0.layer.0.SelfAttention.q.weight_mask', 'decoder.block.0.layer.0.SelfAttention.q.weight_orig', 'decoder.block.0.layer.0.SelfAttention.v.weight_mask', 'decoder.block.0.layer.0.SelfAttention.v.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.k.weight_mask', 'decoder.block.0.layer.1.EncDecAttention.k.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.o.weight_mask', 'decoder.block.0.layer.1.EncDecAttention.o.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.q.weight_mask', 'decoder.block.0.layer.1.EncDecAttention.q.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.v.weight_mask', 'decoder.block.0.layer.1.EncDecAtt

In [52]:
TALib.show_param_ratio(reload_model)

0.26261115074157715

In [53]:
trainer = ta_lib.get_trainer(reload_model , num_train_epochs=30 , batch_size=5 ,output_dir="./output/pruning_check")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [54]:
trainer.evaluate(ta_lib.tokenized_billsum_test)



{'eval_loss': 2.7338263988494873,
 'eval_rouge1': 0.2343,
 'eval_rouge2': 0.1838,
 'eval_rougeL': 0.2253,
 'eval_rougeLsum': 0.2253,
 'eval_gen_len': 18.9838,
 'eval_runtime': 431.505,
 'eval_samples_per_second': 7.576,
 'eval_steps_per_second': 1.516}