In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [5]:
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [26]:
import random
import torch

In [27]:

# Example input text
text = "The quick brown fox jumps over the lazy dog. The cat in the hat. She sells seashells by the seashore."

# Split input text into sentences
sentences = text.split('. ')

# Tokenize each sentence and add special tokens
input_ids = []
attention_masks = []
for sentence in sentences:
    encoded = tokenizer.encode_plus(sentence, add_special_tokens=True, return_attention_mask=True)
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])


# Pad input_ids and attention_masks to same length
max_length = max(len(ids) for ids in input_ids)
input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in input_ids]
attention_masks = [masks + [0] * (max_length - len(masks)) for masks in attention_masks]

# Convert to tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)

# Create a mask for each sentence
masked_lm_labels = []
for i, sentence in enumerate(sentences):
    # Get the positions of all non-special tokens in the sentence
    non_special_tokens = [idx for idx, token in enumerate(tokenizer.encode(sentence, add_special_tokens=False))]

    # Randomly mask 15% of the non-special tokens
    num_to_mask = max(1, int(len(non_special_tokens) * 0.15))
    mask_positions = random.sample(non_special_tokens, num_to_mask)

    # Create a list of masked language model labels
    masked_lm_label = [-100] * max_length
    for pos in mask_positions:
        masked_lm_label[pos+1] = input_ids[i][pos+1]  # +1 to skip special tokens
        input_ids[i][pos+1] = tokenizer.mask_token_id  # +1 to skip special tokens
    masked_lm_labels.append(masked_lm_label)

# Convert masked_lm_labels to a tensor
masked_lm_labels = torch.tensor(masked_lm_labels)


In [15]:
len(sentences)

3

In [30]:
# Create a mask of the special tokens
special_tokens_mask = torch.zeros_like(input_ids)
for token_id in tokenizer.all_special_ids:
    special_tokens_mask[input_ids == token_id] = 1

# Compute the probability of each token being masked
probability_matrix = torch.full_like(input_ids, fill_value=0.15)
probability_matrix.masked_fill_(special_tokens_mask, value=0.0)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [28]:
from transformers import BartForConditionalGeneration, BartConfig, BartTokenizer, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
config = BartConfig.from_pretrained('facebook/bart-base', output_hidden_states=True)
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base', config=config)

# Prepare data for pretraining
# List of token IDs for each sentence
 # List of attention masks for each sentence
# List of masked language model labels for each sentence

# Create a data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Create a TrainingArguments object to configure the training process
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save checkpoints and logs
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=1000,
    logging_dir='./logs',
    learning_rate=1e-4,
    warmup_steps=500,
    fp16=True,
)

# Create a Trainer object and start training
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=(input_ids, attention_masks, masked_lm_labels),
)
trainer.train()




RuntimeError: ignored

**Training Arguments :**

The TrainingArguments object is created with the following arguments:

output_dir: Directory to save checkpoints and logs.

overwrite_output_dir: Overwrite the content of the output directory.

num_train_epochs: Total number of training epochs to perform.
per_device_train_batch_size: Batch size per GPU/TPU core/CPU for training.

save_steps: Number of updates steps before a checkpoint is saved.

save_total_limit: Limit the total amount of checkpoints to save. 
Deletes the older checkpoints.

logging_steps: Number of updates steps before logging training metrics.

logging_dir: Directory to save logs.

learning_rate: The initial learning rate for Adam.

warmup_steps: Linear warmup over warmup_steps.

fp16: Whether to use 16-bit (mixed) precision (through NVIDIA Apex) instead of 32-bit.