In [27]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset, load_metric
import numpy as np
import pandas as pd

In [28]:
!pip install --upgrade datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [29]:
# Read the Kaggle-hosted CSV that provides the `text` / `Summaries` columns used below.
test_set_csv = "/kaggle/input/huffdata-undersampled4k/test_set.csv"
df = pd.read_csv(test_set_csv)

In [30]:
# Keep only the two columns the pipeline reads: the source text and its summary.
df = df.loc[:, ["text", "Summaries"]]

In [31]:
# Preview the selected columns; the rendered output shows the first and last rows.
df

Unnamed: 0,text,Summaries
0,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."
...,...,...
5444,HONG KONG — Hundreds of pilot whales that s...,more than 500 rescuers tried frantically to se...
5445,"NICE, France — Rivère accepts the complim...",Signing balotelli was not just a way to garner...
5446,FRANKFURT — Germans who never really warmed...,Although there was no evidence of that the bun...
5447,Charles Oakley has strong feelings about compe...,He questioned why any n. b. a. free agent woul...


In [32]:
def _word_count(summary):
    """Return the number of whitespace-separated words in ``summary``."""
    return len(summary.split())


# Word count of every reference summary, aligned with the rows of `df`.
num_words = df["Summaries"].apply(_word_count)

In [33]:
# How many summaries are longer than 500 words.
int((num_words > 500).sum())

34

In [34]:
# Tokenization budgets and batch size shared by the cells below.
batch_size = 1            # rows per `map` call and per-device train/eval batch size
max_output_length = 512   # summaries are padded/truncated to this many tokens
max_input_length = 5580   # source texts are padded/truncated to this many tokens

In [35]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of texts and summaries into LED model inputs.

    Relies on the notebook-level ``tokenizer``, ``max_input_length`` and
    ``max_output_length``.

    Args:
        batch: Mapping with a ``"text"`` list (source documents) and a
            ``"Summaries"`` list (target summaries), as handed over by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The same ``batch`` object with four keys added: ``input_ids``,
        ``attention_mask``, ``global_attention_mask`` and ``labels``.
    """
    # Tokenize sources and targets, padding/truncating both to fixed lengths.
    inputs = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["Summaries"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Global attention on the first token of every sample, 0 everywhere else.
    # Each sample gets its own list (no aliasing between rows), and an empty
    # batch simply yields an empty mask instead of raising IndexError.
    batch["global_attention_mask"] = [
        [1 if position == 0 else 0 for position in range(len(ids))]
        for ids in batch["input_ids"]
    ]

    # Replace PAD ids in the targets with -100 so that they are ignored by the loss.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_id else token for token in labels]
        for labels in outputs.input_ids
    ]

    return batch

In [36]:
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [37]:
# Load the ROUGE metric object; `compute_metrics` calls `rouge.compute(...)` on it.
# NOTE(review): `load_metric` is reportedly deprecated in favour of the separate
# `evaluate` package - confirm against the installed `datasets` version.
rouge = load_metric("rouge")

In [38]:
def compute_metrics(pred):
    """Decode predictions and labels and report ROUGE-2 scores.

    Relies on the notebook-level ``tokenizer`` and ``rouge`` objects.

    Args:
        pred: Evaluation output exposing ``predictions`` and ``label_ids``
            as arrays of token ids; ``-100`` marks ignored positions.

    Returns:
        dict with ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure``, each rounded to 4 decimals.
    """
    pad_id = tokenizer.pad_token_id

    # -100 is not a real token id, so swap it for PAD before decoding.
    # np.where returns new arrays: the arrays held by `pred` are left untouched,
    # and predictions get the same masking as the labels.
    raw_labels = np.asarray(pred.label_ids)
    raw_preds = np.asarray(pred.predictions)
    labels_ids = np.where(raw_labels == -100, pad_id, raw_labels)
    pred_ids = np.where(raw_preds == -100, pad_id, raw_preds)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Only ROUGE-2 is requested; `.mid` is the aggregate score that gets reported.
    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [39]:
# Fixed seed so the random subsample and the train/eval split are the same
# on every run (both splits shuffle the rows).
SPLIT_SEED = 42

raw_data = Dataset.from_pandas(df)
# Keep a random 25% of the rows as the working subset ...
subsample = raw_data.train_test_split(test_size=0.75, seed=SPLIT_SEED)["train"]
# ... then hold out 10% of that subset; `split_data` exposes "train" and "test".
split_data = subsample.train_test_split(test_size=0.1, seed=SPLIT_SEED)

In [40]:
# Tokenizer that belongs to the LED checkpoint being fine-tuned.
led_checkpoint = "allenai/led-base-16384"
tokenizer = AutoTokenizer.from_pretrained(led_checkpoint)

In [41]:
# Load the pretrained LED sequence-to-sequence model.
led = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384")
# Enable gradient checkpointing through the model method; passing
# `gradient_checkpointing=True` via `from_pretrained` is the deprecated
# config-kwarg route.
led.gradient_checkpointing_enable()

In [42]:
# Generation settings stored on the model config: beam count, length bounds,
# length penalty, early stopping and n-gram repetition blocking.
generation_defaults = {
    "num_beams": 2,
    "max_length": 512,
    "min_length": 100,
    "length_penalty": 2.0,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
}
for option_name, option_value in generation_defaults.items():
    setattr(led.config, option_name, option_value)

In [43]:
def _tokenize_split(split):
    """Tokenize one dataset split and expose the model columns as torch tensors."""
    tokenized = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["text", "Summaries"],
    )
    # set_format works in place, so the formatted dataset is returned explicitly.
    tokenized.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
    )
    return tokenized


# The same preprocessing is applied to the training and the held-out split.
train_dataset = _tokenize_split(split_data["train"])
test_dataset = _tokenize_split(split_data["test"])

  0%|          | 0/1225 [00:00<?, ?ba/s]

  0%|          | 0/137 [00:00<?, ?ba/s]

In [49]:
# Display the model's module tree (submodule names such as `led.led` are used by the freezing cells).
led

LEDForConditionalGeneration(
  (led): LEDModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): LEDEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): LEDLearnedPositionalEmbedding(16384, 768)
      (layers): ModuleList(
        (0-5): 6 x LEDEncoderLayer(
          (self_attn): LEDEncoderAttention(
            (longformer_self_attn): LEDEncoderSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): Linear(in_features=768, out_features=768, bias=True)
          )
     

In [50]:
# Freeze every parameter under `led.led` so they are excluded from gradient updates.
for frozen_param in led.led.parameters():
    frozen_param.requires_grad_(False)

In [51]:
# Re-enable gradients for the parameters of the LM head only.
# NOTE(review): the printed flags show `led.shared.weight` as trainable afterwards -
# presumably the head shares its weight with the embedding; confirm this is intended.
for head_param in led.lm_head.parameters():
    head_param.requires_grad_(True)

In [52]:
# List each named parameter together with its `requires_grad` flag.
for param_name, parameter in led.named_parameters():
    is_trainable = parameter.requires_grad
    print(param_name, is_trainable)

led.shared.weight True
led.encoder.embed_positions.weight False
led.encoder.layers.0.self_attn.longformer_self_attn.query.weight False
led.encoder.layers.0.self_attn.longformer_self_attn.query.bias False
led.encoder.layers.0.self_attn.longformer_self_attn.key.weight False
led.encoder.layers.0.self_attn.longformer_self_attn.key.bias False
led.encoder.layers.0.self_attn.longformer_self_attn.value.weight False
led.encoder.layers.0.self_attn.longformer_self_attn.value.bias False
led.encoder.layers.0.self_attn.longformer_self_attn.query_global.weight False
led.encoder.layers.0.self_attn.longformer_self_attn.query_global.bias False
led.encoder.layers.0.self_attn.longformer_self_attn.key_global.weight False
led.encoder.layers.0.self_attn.longformer_self_attn.key_global.bias False
led.encoder.layers.0.self_attn.longformer_self_attn.value_global.weight False
led.encoder.layers.0.self_attn.longformer_self_attn.value_global.bias False
led.encoder.layers.0.self_attn.output.weight False
led.encoder

In [47]:
# Compatibility shim: code used in this notebook still refers to `np.object`,
# an alias that has been deprecated in NumPy. Point it at the builtin `object`
# manually to avoid errors.
setattr(np, "object", object)

In [53]:
# Collect the trainer settings in one mapping, then expand it into the arguments object.
training_config = dict(
    output_dir="led-v1",
    num_train_epochs=3,
    # Batching: `batch_size` examples per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    fp16=True,
    # Evaluate and checkpoint once per epoch, keep at most 2 checkpoints,
    # and reload the best checkpoint when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Evaluation uses generated sequences, which `compute_metrics` decodes.
    predict_with_generate=True,
    logging_steps=250,
)
training_args = Seq2SeqTrainingArguments(**training_config)

In [54]:
# Wire the model, tokenizer, datasets, arguments and metric function into the trainer.
trainer_kwargs = {
    "model": led,
    "tokenizer": tokenizer,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)

In [55]:
# Run fine-tuning; the returned TrainOutput (global step, training loss, runtime metrics) is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
0,0.966,0.902945,0.3409,0.5853,0.4024
1,0.8377,0.837789,0.3852,0.4635,0.3825
2,0.7876,0.820543,0.405,0.4491,0.3885


Non-default generation parameters: {'max_length': 512, 'min_length': 100, 'early_stopping': True, 'num_beams': 2, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Checkpoint destination directory led-v1/checkpoint-612 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'min_length': 100, 'early_stopping': True, 'num_beams': 2, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 512, 'min_length': 100, 'early_stopping': True, 'num_beams': 2, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
There were missing keys in the checkpoint model loaded: ['led.encoder.embed_tokens.weight', 'led.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=918, training_loss=0.8422524778411798, metrics={'train_runtime': 6406.0724, 'train_samples_per_second': 0.574, 'train_steps_per_second': 0.143, 'total_flos': 1.350745825591296e+16, 'train_loss': 0.8422524778411798, 'epoch': 3.0})

In [None]:
import torch
from transformers import pipeline