# Import

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q transformers

[K     |████████████████████████████████| 4.0 MB 4.1 MB/s 
[K     |████████████████████████████████| 77 kB 5.6 MB/s 
[K     |████████████████████████████████| 880 kB 58.2 MB/s 
[K     |████████████████████████████████| 596 kB 77.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 54.5 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [3]:
# Seed
seed = 42
from tensorflow.random import set_seed
set_seed(seed)

# Utils
import os
import re
import math
import numpy as np
import pandas as pd

# Deep Learning
import tensorflow as tf
import tensorflow.keras as keras
from transformers import LEDTokenizerFast, TFLEDForConditionalGeneration

# Sklearn
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit

# Metrics
!pip install -q rouge_score
!pip install -q datasets
from datasets import load_metric

# Speed Optimization
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

[K     |████████████████████████████████| 325 kB 4.3 MB/s 
[K     |████████████████████████████████| 136 kB 57.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 75.2 MB/s 
[K     |████████████████████████████████| 212 kB 76.7 MB/s 
[K     |████████████████████████████████| 127 kB 56.7 MB/s 
[K     |████████████████████████████████| 144 kB 85.2 MB/s 
[K     |████████████████████████████████| 271 kB 57.2 MB/s 
[K     |████████████████████████████████| 94 kB 3.5 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[?25h

In [24]:
config = {'paths' : ['/content/drive/MyDrive/debate2019.csv'],
          'pretrained' : 'allenai/led-base-16384',
          'batch_size' : 1,
          'max_lr': 5e-7,
          'epochs' : 5,
          'tok_input' : {'padding' : 'max_length',
                         'truncation' : True,
                         'max_length' : 4096,
                         'add_special_tokens' : True,
                         'return_tensors' : 'tf',
                         'is_split_into_words' : False,
                         'return_offsets_mapping' : False},
          
          'tok_output' : {'padding' : 'max_length',
                          'truncation' : True,
                          'max_length' : 256,
                          'add_special_tokens' : True,
                          'return_tensors' : 'tf',
                          'is_split_into_words' : False,
                          'return_offsets_mapping' : False},
         }

# Dataset

In [5]:
class Dataset:
    def __init__(self) :
        paths = config['paths']
        self.data = pd.read_csv(paths[0])
        self.data = self.data[self.data['#WordsDocument'] > 1000]
        self.tokenizer = LEDTokenizerFast.from_pretrained(config['pretrained'])
    
    def _split(self):
        # self.data['y_bins'] = pd.cut(self.data['AbsCompressionRatio'], bins = 5)
        # sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.1, random_state = seed)
        # self.data['fold'] = 0
        
        # for t_, v_ in sss.split(self.data.drop('y_bins', axis=1), self.data['y_bins']): 
        #     self.data.loc[t_,'fold'] = 1
        
        self.data['fold'] = 0
        ss = ShuffleSplit(n_splits = 1, test_size = 0.1, random_state = seed)
        for t_, v_ in ss.split(self.data): 
            self.data['fold'].iloc[t_] = 1
        
    def _tokenize(self, i):
        x = self.tokenizer(self.data[self.data['fold'] == i]['Full-Document'].values.tolist(), 
                           **config['tok_input'])  
        
        y = self.tokenizer(self.data[self.data['fold'] == i]['Abstract'].values.tolist(), 
                           **config['tok_output'])
        
        # For text summarization the paper encourage to only set the global attention for the 1st token.
        glob_attn = len(x["input_ids"]) * [[0 for _ in range(len(x["input_ids"][0]))]]
        glob_attn[0][0] = 1
        
        return ({'input_ids': x['input_ids'],
                'attention_mask' : x['attention_mask'],
                'decoder_attention_mask' : y['attention_mask'],
                'global_attention_mask' : tf.constant(glob_attn, dtype = tf.int32)}, y['input_ids'])
    
    def _to_tf(self, ds):
        return tf.data.Dataset.from_tensor_slices(ds).batch(config['batch_size']) \
                                                     .prefetch(1)                                           
    
    def get(self) :
        self._split()
        trainset = self._tokenize(1)
        valset = self._tokenize(0)
        
        return (self._to_tf(trainset),
                self._to_tf(valset))
            

In [6]:
trainset, valset = Dataset().get()

Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


# Model

In [26]:
keras.backend.clear_session()

class Longformer(keras.Model):
    def __init__(self):
        super(Longformer, self).__init__()
        self.model = TFLEDForConditionalGeneration.from_pretrained(config['pretrained'], 
                                                                   return_dict = True)
        
    def call(self, inputs, training = False):
        x, y = inputs
        outputs = self.model(input_ids = x['input_ids'], 
                             attention_mask = x['attention_mask'], 
                             decoder_attention_mask = x['decoder_attention_mask'],
                             labels = y, 
                             global_attention_mask = x['global_attention_mask'])
        return outputs.loss, outputs.logits
    
    def train_step(self, data):
        with tf.GradientTape() as tape:
            loss, logits = self(data, training=True)

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        return {"loss": tf.reduce_mean(loss)}
    
    
    def test_step(self, data):
        loss, logits = self(data, training=False)
        
        return {"loss": tf.reduce_mean(loss)}

In [27]:
led = Longformer()
led.compile(optimizer = keras.optimizers.Adam(config['max_lr']))
# led.model.from_pretrained('/content/drive/MyDrive/LED-fine-tune')

All model checkpoint layers were used when initializing TFLEDForConditionalGeneration.

All the layers of TFLEDForConditionalGeneration were initialized from the model checkpoint at allenai/led-base-16384.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLEDForConditionalGeneration for predictions without further training.


# 1Cycle Scheduler

In [21]:
K = keras.backend

class OneCycleLr(keras.callbacks.Callback):
    def __init__(self,
                 max_lr: float,
                 total_steps: int = None,
                 epochs: int = None,
                 steps_per_epoch: int = None,
                 pct_start: float = 0.3,
                 anneal_strategy: str = "cos",
                 cycle_momentum: bool = True,
                 base_momentum: float = 0.85,
                 max_momentum: float = 0.95,
                 div_factor: float = 25.0,
                 final_div_factor: float = 1e4):

        super(OneCycleLr, self).__init__()

        # validate total steps:
        if total_steps :
            self.total_steps = total_steps
        else:
            self.total_steps = epochs * steps_per_epoch

        self.step_num = 0
        self.step_size_up = float(pct_start * self.total_steps) - 1
        self.step_size_down = float(self.total_steps - self.step_size_up) - 1

        # Validate pct_start
        if anneal_strategy == "cos":
            self.anneal_func = self._annealing_cos
        elif anneal_strategy == "linear":
            self.anneal_func = self._annealing_linear

        # Initialize learning rate variables
        self.initial_lr = max_lr / div_factor
        self.max_lr = max_lr
        self.min_lr = self.initial_lr / final_div_factor

        # Initial momentum variables
        self.cycle_momentum = cycle_momentum
        if self.cycle_momentum:
            self.m_momentum = max_momentum
            self.momentum = max_momentum
            self.b_momentum = base_momentum

        # Initialize variable to learning_rate & momentum
        self.track_lr = []
        self.track_mom = []

    def _annealing_cos(self, start, end, pct):
        cos_out = math.cos(math.pi * pct) + 1
        return end + (start - end) / 2.0 * cos_out

    def _annealing_linear(self, start, end, pct):
        return (end - start) * pct + start

    def set_lr_mom(self):
        if self.step_num <= self.step_size_up:
            # update learining rate
            computed_lr = self.anneal_func(self.initial_lr, self.max_lr, self.step_num / self.step_size_up)
            K.set_value(self.model.optimizer.lr, computed_lr)
            # update momentum if cycle_momentum
            if self.cycle_momentum:
                computed_momentum = self.anneal_func(self.m_momentum, self.b_momentum, self.step_num / self.step_size_up)
                try:
                    K.set_value(self.model.optimizer.momentum,
                                computed_momentum)
                except:
                    K.set_value(self.model.optimizer.beta_1, computed_momentum)
        else:
            down_step_num = self.step_num - self.step_size_up
            # update learning rate
            computed_lr = self.anneal_func(self.max_lr, self.min_lr, down_step_num / self.step_size_down)
            K.set_value(self.model.optimizer.lr, computed_lr)
            # update momentum if cycle_momentum
            if self.cycle_momentum:
                computed_momentum = self.anneal_func(self.b_momentum, self.m_momentum, down_step_num / self.step_size_down)
                try:
                    K.set_value(self.model.optimizer.momentum,
                                computed_momentum)
                except:
                    K.set_value(self.model.optimizer.beta_1, computed_momentum)

    def on_train_begin(self, logs=None):
        # Set initial learning rate & momentum values
        K.set_value(self.model.optimizer.lr, self.initial_lr)
        if self.cycle_momentum:
            try:
                K.set_value(self.model.optimizer.momentum, self.momentum)
            except:
                K.set_value(self.model.optimizer.beta_1, self.momentum)

    def on_train_batch_end(self, batch, logs=None):
        # Grab the current learning rate & momentum
        lr = float(K.get_value(self.model.optimizer.lr))
        try:
            mom = float(K.get_value(self.model.optimizer.momentum))
        except:
            mom = float(K.get_value(self.model.optimizer.beta_1))
        # Append to the list
        self.track_lr.append(lr)
        self.track_mom.append(mom)
        # Update learning rate & momentum
        self.set_lr_mom()
        # increment step_num
        self.step_num += 1

# Training

In [22]:
scheduler = OneCycleLr(max_lr=config['max_lr'], 
                       steps_per_epoch=trainset.cardinality().numpy(), 
                       epochs=config['epochs'])

checkpoint = keras.callbacks.ModelCheckpoint(filepath = '/content/drive/MyDrive/ckpt_led',
                                             save_best_only = True,
                                             save_weights_only = True)
early = keras.callbacks.EarlyStopping(patience=1, restore_best_weights=True)

In [None]:
history = led.fit(trainset,
                  validation_data = valset,
                  callbacks = [early, scheduler],
                  verbose = 1,
                  epochs = config['epochs'])

led.model.save_pretrained('/content/drive/MyDrive')

In [None]:
led.summary()

Model: "longformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tfled_for_conditional_gener  multiple                 161894745 
 ation (TFLEDForConditionalG                                     
 eneration)                                                      
                                                                 
Total params: 161,894,745
Trainable params: 161,844,480
Non-trainable params: 50,265
_________________________________________________________________


# Testing

In [None]:
class TextSummarization():
  def __init__(self, pretrained, tok, beam, temperature):

    self.model = Longformer().model.from_pretrained(pretrained)
    self.tokenizer = LEDTokenizerFast.from_pretrained(tok)
    self.beam = beam
    self.temperature = temperature

  def generate(self, text):
    text = self.tokenizer(text, **config['tok_input']).input_ids
    tokens = self.model.generate(text,
                                 min_length = 0,
                                 max_length = 128,
                                 num_beams = self.beam,
                                 temperature = self.temperature,
                                 do_sample = True,
                                 repetition_penalty = 2.5,
                                 length_penalty = 1,
                                 early_stopping = True).numpy()[0]
    return self.tokenizer.decode(tokens, skip_special_tokens = True)

  def post_processing(self, text):
    text = text[:text.rfind('.')+1]
    return re.sub("[\(\[].*?[\)\]]", "", text)

  def summarize(self, text):
    text = self.generate(text)
    text = self.post_processing(text)
    return text

ts = TextSummarization(pretrained = '/content/drive/MyDrive/LED', tok = 'allenai/led-base-16384',beam = 5, temperature = 1.2)

All model checkpoint layers were used when initializing TFLEDForConditionalGeneration.

All the layers of TFLEDForConditionalGeneration were initialized from the model checkpoint at allenai/led-base-16384.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLEDForConditionalGeneration for predictions without further training.
All model checkpoint layers were used when initializing TFLEDForConditionalGeneration.

All the layers of TFLEDForConditionalGeneration were initialized from the model checkpoint at /content/drive/MyDrive/LED.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLEDForConditionalGeneration for predictions without further training.


In [None]:
def compute_rouge(id_doc, prediction):
    rouge = load_metric("rouge")
    rouge.add(reference = data['Abstract'].values.tolist()[id_doc], prediction = summary)
    score_abs = rouge.compute()['rougeL']
    rouge = load_metric("rouge")
    rouge.add(reference = data['Extract'].values.tolist()[id_doc], prediction = summary)
    score_ext = rouge.compute()['rougeL']
    return score_abs.mid, score_ext.mid

def compute_bleu(id_doc, prediction):
    bleu = load_metric("bleu")
    score_abs = bleu.compute(predictions=[data['Abstract'].values.tolist()[id_doc].split()], 
                             references=[[summary.split()]], max_order = 1)['bleu']
    bleu = load_metric("bleu")
    score_ext = bleu.compute(predictions=[data['Extract'].values.tolist()[id_doc].split()], 
                             references=[[summary.split()]], max_order = 1)['bleu']
    return score_abs, score_ext

In [None]:
paths = config['paths']
data = pd.read_csv(paths[0])
id_doc = 17330

print('Abstract :')
print(data['Abstract'].values.tolist()[id_doc])
print()
summary = ts.summarize(data['Full-Document'].values.tolist()[id_doc])
print('Summary :')
print(summary)

Abstract :
China views arms sales as vital to crowding out regional threats

Summary :
China’s arms sales are a success story – evidence proves.


In [None]:
abs, ext = compute_rouge(id_doc, summary)
print('Socre on Abstract :', abs)
print('Socre on Extract :', ext)

Socre on Abstract : Score(precision=0.3, recall=0.2727272727272727, fmeasure=0.28571428571428564)
Socre on Extract : Score(precision=0.6, recall=0.027906976744186046, fmeasure=0.05333333333333332)


In [None]:
abs, ext = compute_bleu(id_doc, summary)
print('Socre on Abstract :', abs)
print('Socre on Extract :', ext)

Socre on Abstract : 0.18181818181818182
Socre on Extract : 0.030303030303030304
