# Fine-tuning BART-large on the CNN/DailyMail news dataset

In [2]:
### MODULES ###

import sys,os
import tqdm
import csv
from datetime import datetime 
import numpy as np
import pandas as pd
import json


from datasets import load_dataset, Dataset

import torch
from torch import cuda
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torch.nn.functional as F


# Load the ROUGE metric
import evaluate

from transformers import AutoTokenizer, BartForConditionalGeneration

In [3]:

# Number of logical CPUs reported by the OS; passed later as `num_proc` to dataset.map
NUM_PROCS = os.cpu_count()
print("NUM_PROCS = ", NUM_PROCS)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)



NUM_PROCS =  12
cuda


In [4]:

SEED = 42
# Number of DataLoader worker processes (was config['config_machine']["NUM_LOADER"]);
# tune it to the number of threads available on the machine
NUM_LOADER = 4

# Reproducibility: fix the NumPy and PyTorch random seeds and request deterministic cuDNN kernels
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Load the CNN/DailyMail dataset

In [5]:
# Load CNN/DailyMail dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Fraction of every split kept for quick experiments.
# Set it to None (or 1.0) to keep the full dataset for the real training run,
# instead of having to comment this part out.
percentage = 0.05

if percentage is not None and percentage < 1.0:
    for split in dataset:
        # Shuffle with a fixed seed so the same subset is drawn on every run,
        # then keep the first `percentage` of the shuffled rows.
        subset_size = int(len(dataset[split]) * percentage)
        dataset[split] = dataset[split].shuffle(seed=SEED).select(range(subset_size))

# Check the dataset structure
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 14355
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 668
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 574
    })
})


# Load the model and tokenizer

In [6]:
### Load model ###
MODEL_HUB = 'facebook/bart-large'

# The tokenizer and the seq2seq model are both loaded from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_HUB,
    clean_up_tokenization_spaces=True,
)
model = BartForConditionalGeneration.from_pretrained(
    MODEL_HUB,
    forced_bos_token_id=0,
)

# Quick inspection: maximum input length, concrete classes, and the tokenizer configuration
for info in (tokenizer.model_max_length, type(tokenizer), type(model), tokenizer):
    print(info)

1024
<class 'transformers.models.bart.tokenization_bart_fast.BartTokenizerFast'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>
BartTokenizerFast(name_or_path='facebook/bart-large', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip

In [7]:
def len_distrib(batch):
    """
    Tokenize a batch of examples and record their untruncated token counts.

    Intended for `dataset.map(..., batched=True)`.

    Args:
        batch: mapping with an "article" and a "highlights" entry, each a list of strings.

    Returns:
        dict of per-example lists:
            'input_ids' / 'input_mask':   article token ids and attention mask, truncated
                                          and padded to `tokenizer.model_max_length`.
            'input_len':                  article token count before truncation.
            'target_ids' / 'target_mask': same as above for the highlights.
            'target_len':                 highlights token count before truncation.
    """
    max_length = tokenizer.model_max_length

    # One batched tokenizer call per column instead of one call per example.
    # Truncation is disabled on purpose: these calls only measure the real lengths.
    len_articles = [
        len(ids) for ids in tokenizer(batch["article"], truncation=False)["input_ids"]
    ]
    len_highlights = [
        len(ids) for ids in tokenizer(batch["highlights"], truncation=False)["input_ids"]
    ]

    # Fixed-size encodings: every row is truncated/padded to the same length,
    # which lets the collate function stack them directly into tensors.
    source = tokenizer(batch["article"], truncation=True, max_length=max_length, padding='max_length')
    resume = tokenizer(batch["highlights"], truncation=True, max_length=max_length, padding='max_length')

    return {
        'input_ids': source['input_ids'],
        'input_mask': source['attention_mask'],
        'input_len': len_articles,
        'target_ids': resume['input_ids'],
        'target_mask': resume['attention_mask'],
        'target_len': len_highlights
        }


# Tokenize every split in batches of 64 examples, spread over NUM_PROCS worker processes;
# the keys returned by len_distrib are added to the dataset (nothing is saved to disk here)
dataset = dataset.map(
    len_distrib,
    batched=True,
    batch_size=64,
    num_proc=NUM_PROCS,
)


Map (num_proc=12):   0%|          | 0/14355 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1153 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1105 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1498 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1210 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1585 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Map (num_proc=12):   0%|          | 0/668 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1064 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1048 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1752 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1380 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2112 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Map (num_proc=12):   0%|          | 0/574 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1025 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1647 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1153 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1917 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1592 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

In [8]:

# Define the custom collate function
def collate_fn(batch):
    """
    Stack a list of pre-tokenized examples into a dict of `torch.long` tensors.

    No padding is performed here: every per-example list stored under a given key must
    already have the same length (the tokenization step pads them), otherwise
    `torch.tensor` cannot build a rectangular tensor.

    Args:
        batch: list of examples, each a mapping with the keys 'input_ids', 'input_mask',
            'target_ids', 'target_mask', 'input_len' and 'target_len'.

    Returns:
        dict with 'input_ids', 'attention_mask', 'decoder_input_ids',
        'decoder_attention_mask', 'input_len' and 'target_len'.
        'decoder_input_ids' / 'decoder_attention_mask' are the target ids / mask as stored,
        without any shifting.
    """

    def stack(field):
        # Gather one field across all examples into a single integer tensor
        return torch.tensor([example[field] for example in batch], dtype=torch.long)

    return {
        'input_ids': stack('input_ids'),
        'attention_mask': stack('input_mask'),
        'decoder_input_ids': stack('target_ids'),
        'decoder_attention_mask': stack('target_mask'),
        'input_len': stack('input_len'),
        'target_len': stack('target_len'),
    }


# DataLoader settings: training batches are reshuffled, evaluation keeps the dataset order.
# pin_memory enables faster GPU transfers.
train_params = dict(
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=NUM_LOADER,
    pin_memory=True,
)

eval_params = dict(
    batch_size=4,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=NUM_LOADER,
    pin_memory=True,
)

# Loaders used further down for the training and validation stages of the model
train_loader = DataLoader(dataset["train"], **train_params)
eval_loader = DataLoader(dataset["validation"], **eval_params)

# Smoke test: show the first training batch only.
# `batch` stays defined after the loop and is reused by the exploration cells below.
for batch in train_loader:
    print(batch)
    break


{'input_ids': tensor([[    0,  2765,   479,  ...,     1,     1,     1],
        [    0,   250,  3828,  ...,     5,  1151,     2],
        [    0,   250, 17052,  ...,     1,     1,     1],
        [    0,   970,    58,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'decoder_input_ids': tensor([[    0,  9058,  2152,  ...,     1,     1,     1],
        [    0,   104,  4774,  ...,     1,     1,     1],
        [    0,   250,  1150,  ...,     1,     1,     1],
        [    0, 14563,  7414,  ...,     1,     1,     1]]), 'decoder_attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'input_len': tensor([ 331, 1393,  727,  498]), 'target_len': tensor([ 53, 100,  97,  58])}


# Manipulating the different parts of the BART model

In [27]:

# Print the Method Resolution Order (MRO) of BartForConditionalGeneration, i.e. the tuple of
# classes Python searches, in order, when resolving attributes and methods on the model
print(BartForConditionalGeneration.__mro__)

(<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>, <class 'transformers.models.bart.modeling_bart.BartPreTrainedModel'>, <class 'transformers.modeling_utils.PreTrainedModel'>, <class 'torch.nn.modules.module.Module'>, <class 'transformers.modeling_utils.ModuleUtilsMixin'>, <class 'transformers.generation.utils.GenerationMixin'>, <class 'transformers.utils.hub.PushToHubMixin'>, <class 'transformers.integrations.peft.PeftAdapterMixin'>, <class 'object'>)


In [9]:
## Generation sanity check (example taken from Hugging Face): let the model fill in <mask>
prompt_inputs = tokenizer(["UN Chief Says There Is No <mask> in Syria"], return_tensors="pt")
generated_ids = model.generate(**prompt_inputs)

# Last expression of the cell, so the decoded text is displayed as the cell output
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)




['UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria']

In [None]:
# Build noisy "reversed" positions for the decoder tokens of the sample batch:
# each non-pad token gets the count of non-pad tokens from its own position to the end
# of the row (e.g. mask [1,1,1,0,0] -> [3,2,1,0,0]); pad positions stay at 0.
decoder_ids = batch["decoder_input_ids"]

# True for real tokens, False where the pad id is present
mask = decoder_ids != tokenizer.pad_token_id

# Suffix sum of the mask: flip, cumulative sum, flip back
flipped_mask = torch.flip(mask.to(torch.get_default_dtype()), dims=(1,))
reversed_position_input = torch.flip(flipped_mask.cumsum(dim=1), dims=(1,))

print(reversed_position_input)

# Standard Gaussian noise, zeroed on pad positions so they remain exactly 0
normal_round = torch.randn(decoder_ids.shape) * mask

# Add the noise, round to the nearest integer, take the absolute value and convert to long
noisy_positions = torch.round(reversed_position_input + normal_round)
reversed_position_input = torch.abs(noisy_positions).to(torch.long)
print(reversed_position_input)

tensor([[ 53.,  52.,  51.,  ...,   0.,   0.,   0.],
        [100.,  99.,  98.,  ...,   0.,   0.,   0.],
        [ 97.,  96.,  95.,  ...,   0.,   0.,   0.],
        [ 58.,  57.,  56.,  ...,   0.,   0.,   0.]])
tensor([[ 52,  51,  51,  ...,   0,   0,   0],
        [ 98, 100,  99,  ...,   0,   0,   0],
        [ 97,  96,  96,  ...,   0,   0,   0],
        [ 57,  56,  57,  ...,   0,   0,   0]])


In [37]:
# Inspect the BART decoder and run its two embedding layers on the sample batch
bart_decoder = model.model.decoder
print(bart_decoder)

decoder_ids = batch["decoder_input_ids"]
token_embeddings = bart_decoder.embed_tokens(decoder_ids)
position_embeddings = bart_decoder.embed_positions(decoder_ids)

BartDecoder(
  (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
  (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
  (layers): ModuleList(
    (0-11): 12 x BartDecoderLayer(
      (self_attn): BartSdpaAttention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder_attn): BartSdpaAttention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=

In [155]:
from torch import nn
import math

# Fixed (non-trainable) sinusoidal table used to embed the reversed positions:
#   pe[p, 2i]   = sin(p / 10000^(2i / d_model))
#   pe[p, 2i+1] = cos(p / 10000^(2i / d_model))
d_model = model.config.d_model  # embedding width of the loaded model (1024 for bart-large)
max_len = tokenizer.model_max_length

pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

# Column k uses the frequency 10000^(-2*(k//2)/d_model), so each (sin, cos) pair of
# adjacent columns shares the same frequency
div_term = torch.exp(-2*(torch.arange(0, d_model)//2) / d_model * math.log(10000.0) )

print(div_term.shape)

print(position* div_term)


pe[:, 0::2] = torch.sin(position*div_term[0::2] )
pe[:, 1::2] = torch.cos(position*div_term[1::2])

# Public API for a frozen embedding built from an existing weight matrix (replaces the
# private `_weight` / `_freeze` constructor arguments).
# No padding_idx is passed: the indices looked up here are positions, not vocabulary ids,
# so the tokenizer's pad token id does not designate a padding row in this table.
# NOTE: masked (pad) positions carry index 0 and therefore receive pe[0]
# (sin(0) = 0, cos(0) = 1), not an all-zero vector.
embed_reverse_positions = nn.Embedding.from_pretrained(pe, freeze=True)

# Noisy reversed positions can reach or exceed max_len for targets that fill the whole
# sequence; clamp them so the lookup never indexes past the last row of the table.
embed_reverse_positions(reversed_position_input.clamp(max=max_len - 1))

torch.Size([1024])
tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.0000e+00, 1.0000e+00, 9.8217e-01,  ..., 1.0366e-04, 1.0182e-04,
         1.0182e-04],
        [2.0000e+00, 2.0000e+00, 1.9643e+00,  ..., 2.0733e-04, 2.0363e-04,
         2.0363e-04],
        ...,
        [1.0210e+03, 1.0210e+03, 1.0028e+03,  ..., 1.0584e-01, 1.0395e-01,
         1.0395e-01],
        [1.0220e+03, 1.0220e+03, 1.0038e+03,  ..., 1.0594e-01, 1.0406e-01,
         1.0406e-01],
        [1.0230e+03, 1.0230e+03, 1.0048e+03,  ..., 1.0605e-01, 1.0416e-01,
         1.0416e-01]])


tensor([[[ 0.9866, -0.1630,  0.7225,  ...,  1.0000,  0.0053,  1.0000],
         [ 0.6702,  0.7422, -0.1738,  ...,  1.0000,  0.0052,  1.0000],
         [ 0.6702,  0.7422, -0.1738,  ...,  1.0000,  0.0052,  1.0000],
         ...,
         [ 0.0000,  1.0000,  0.0000,  ...,  1.0000,  0.0000,  1.0000],
         [ 0.0000,  1.0000,  0.0000,  ...,  1.0000,  0.0000,  1.0000],
         [ 0.0000,  1.0000,  0.0000,  ...,  1.0000,  0.0000,  1.0000]],

        [[-0.5734, -0.8193,  0.9072,  ...,  0.9999,  0.0100,  1.0000],
         [-0.5064,  0.8623, -0.7365,  ...,  0.9999,  0.0102,  0.9999],
         [-0.9992,  0.0398,  0.1537,  ...,  0.9999,  0.0101,  0.9999],
         ...,
         [ 0.0000,  1.0000,  0.0000,  ...,  1.0000,  0.0000,  1.0000],
         [ 0.0000,  1.0000,  0.0000,  ...,  1.0000,  0.0000,  1.0000],
         [ 0.0000,  1.0000,  0.0000,  ...,  1.0000,  0.0000,  1.0000]],

        [[ 0.3796, -0.9251,  0.8536,  ...,  0.9999,  0.0099,  1.0000],
         [ 0.9836, -0.1804,  0.0407,  ...,  1