In [1]:
!pip install transformers -q


In [1]:
import os
import re

import tensorflow as tf

import transformers

In [2]:
# Resolve the TPU cluster, connect to it, and initialize the TPU system before
# building the distribution strategy that the rest of the notebook uses.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# `strategy` is used later to size the global batch and to scope model creation.
strategy = tf.distribute.TPUStrategy(tpu)

print(f"Available number of replicas: {strategy.num_replicas_in_sync}")

Available number of replicas: 8


In [3]:
# Hub identifiers for the tokenizer and for the configuration we start from.
tokenizer_checkpoint = "tf-tpu/unigram-tokenizer-wikitext"
pretrained_model_config = "roberta-base"

# `tokenizer` is only ever bound to the loaded tokenizer object; the checkpoint
# identifier keeps its own name instead of sharing one name for two kinds of value.
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_checkpoint)
config = transformers.AutoConfig.from_pretrained(pretrained_model_config)
# Override the configured vocabulary size with the one reported by our tokenizer.
config.vocab_size = tokenizer.vocab_size

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [4]:
train_dataset_path = "gs://tf-tpu-training-resources/train"
eval_dataset_path = "gs://tf-tpu-training-resources/validation"

# These are GCS URIs, which are always "/"-separated. Build the glob patterns
# with an explicit "/" instead of os.path.join, which would insert the host
# operating system's separator.
training_records = tf.io.gfile.glob(f"{train_dataset_path}/*.tfrecord")
eval_records = tf.io.gfile.glob(f"{eval_dataset_path}/*.tfrecord")

In [5]:
def count_samples(file_list):
    """Sum the per-file sample counts encoded in TFRecord filenames.

    Only the part of each path after the last "/" is inspected; the files
    themselves are never opened. The name must contain
    ``-<digits>-<count>.tfrecord``, and ``<count>`` is taken as the number of
    samples stored in that file.

    Args:
        file_list: Iterable of file paths (strings).

    Returns:
        int: Sum of the counts of all files (0 when ``file_list`` is empty).

    Raises:
        ValueError: If a filename does not contain the expected pattern.
    """
    num_samples = 0
    for file in file_list:
        filename = file.split("/")[-1]
        match = re.search(r"-\d+-(\d+)\.tfrecord", filename)
        if match is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError(
                f"Cannot read the sample count from {file!r}: expected a filename "
                "containing '-<digits>-<count>.tfrecord'."
            )
        num_samples += int(match.group(1))

    return num_samples


# Total number of training samples, derived from the training record filenames.
num_train_samples = count_samples(training_records)
print(f"Number of total training samples: {num_train_samples}")

Number of total training samples: 300917


In [6]:
max_sequence_length = 512


def decode_fn(example):
    """Parse one serialized example into its fixed-length features.

    Both ``input_ids`` and ``attention_mask`` are read as int64 vectors of
    length ``max_sequence_length``.

    Args:
        example: A single serialized example to parse.

    Returns:
        The result of ``tf.io.parse_single_example`` for the two features.
    """
    # Both features share the same fixed-length int64 specification.
    fixed_length_int64 = tf.io.FixedLenFeature(
        shape=(max_sequence_length,), dtype=tf.int64
    )
    feature_spec = {
        "input_ids": fixed_length_int64,
        "attention_mask": fixed_length_int64,
    }
    return tf.io.parse_single_example(example, feature_spec)

In [7]:
mlm_probability = 0.15
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=mlm_probability,
    return_tensors="tf",
)


def mask_with_collator(batch):
    """Apply the collator's token masking to a batch.

    Positions where ``attention_mask`` is zero, and positions holding the
    tokenizer's CLS or SEP token id, are passed to the collator as special
    tokens. ``batch["input_ids"]`` is replaced by the collator's first output
    and ``batch["labels"]`` is set to its second output.

    Args:
        batch: Mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.

    Returns:
        The same ``batch`` mapping, updated in place.
    """
    token_ids = batch["input_ids"]

    # Build the special-token mask from its three components.
    is_padding = ~tf.cast(batch["attention_mask"], tf.bool)
    is_cls = token_ids == tokenizer.cls_token_id
    is_sep = token_ids == tokenizer.sep_token_id
    special_tokens_mask = is_padding | is_cls | is_sep

    masked_ids, mlm_labels = data_collator.tf_mask_tokens(
        token_ids,
        vocab_size=len(tokenizer),
        mask_token_id=tokenizer.mask_token_id,
        special_tokens_mask=special_tokens_mask,
    )
    batch["input_ids"] = masked_ids
    batch["labels"] = mlm_labels
    return batch

In [8]:
auto = tf.data.AUTOTUNE
shuffle_buffer_size = 2**18


def prepare_dataset(
    records, decode_fn, mask_fn, batch_size, shuffle, shuffle_buffer_size=None
):
    """Build a batched, masked and prefetched dataset from TFRecord files.

    Args:
        records: List of TFRecord file paths.
        decode_fn: Function mapped over each serialized example.
        mask_fn: Function mapped over each batch.
        batch_size: Number of examples per batch; incomplete batches are dropped.
        shuffle: If true, shuffle the file order and then the decoded examples.
        shuffle_buffer_size: Buffer size for the example-level shuffle. Must
            not be None when ``shuffle`` is true.

    Returns:
        The resulting ``tf.data`` dataset.
    """
    expected_examples = count_samples(records)

    record_files = tf.data.Dataset.from_tensor_slices(records)
    if shuffle:
        # First-level shuffle: randomize the order of the files themselves.
        record_files = record_files.shuffle(len(record_files))

    # TF can't infer the total sample count because it doesn't read
    #  all the records yet, so we assert it here.
    examples = (
        tf.data.TFRecordDataset(record_files, num_parallel_reads=auto)
        .apply(tf.data.experimental.assert_cardinality(expected_examples))
        .map(decode_fn, num_parallel_calls=auto)
    )
    if shuffle:
        # Second-level shuffle: randomize decoded examples within a buffer.
        assert shuffle_buffer_size is not None
        examples = examples.shuffle(shuffle_buffer_size)

    return (
        examples.batch(batch_size, drop_remainder=True)
        .map(mask_fn, num_parallel_calls=auto)
        .prefetch(auto)
    )

In [9]:
per_replica_batch_size = 16  # Change as needed.
batch_size = strategy.num_replicas_in_sync * per_replica_batch_size
shuffle_buffer_size = 2**18  # Default corresponds to a 1GB buffer for seq_len 512

# Arguments shared by the training and evaluation pipelines.
shared_pipeline_kwargs = {
    "decode_fn": decode_fn,
    "mask_fn": mask_with_collator,
    "batch_size": batch_size,
}

# Only the training pipeline is shuffled.
train_dataset = prepare_dataset(
    training_records,
    shuffle=True,
    shuffle_buffer_size=shuffle_buffer_size,
    **shared_pipeline_kwargs,
)
eval_dataset = prepare_dataset(eval_records, shuffle=False, **shared_pipeline_kwargs)

In [10]:
# Pull one batch from the training pipeline to inspect its structure.
single_batch = next(iter(train_dataset))
print(single_batch.keys())

dict_keys(['attention_mask', 'input_ids', 'labels'])


In [11]:
# Look the tensors up by key instead of scanning every key of the batch: a
# missing key now fails right here with a KeyError rather than silently leaving
# `input_ids` / `labels` undefined (or stale) for the cells below.
input_ids = single_batch["input_ids"]
labels = single_batch["labels"]
print(f"Input shape: {input_ids.shape}")
print(f"Label shape: {labels.shape}")

Input shape: (128, 512)
Label shape: (128, 512)


In [12]:
# Position within the batch of the sample to inspect.
idx = 0
print("Taking the first sample:\n")
# Decode that sample's token ids (taken from the masked batch) back into text.
print(tokenizer.decode(input_ids[idx].numpy()))

Taking the first sample:

or not is immaterial ". 
[SEP][CLS] 3. Asteya ( Non @-@ thi[MASK]ving ) - According to Puruşārthas[MASK]ddhyupāya : sugar
[SEP][CLS] Driven by passions, taking anything that has not been given be termed as theft and since[MASK]ft causes injury,[MASK]it casemateis hiṃsā 
[SEP][CLS] 4.[MASK]charya- It means chastity for householders and celibacy in action, words & thoughts for ascetics. 
[SEP][CLS] Unchastity ( abrahma ) is copulation[MASK]arising from[MASK] desire. There is all @-@ round injury to the living in copulation and, therefore, [MASK] is h[MASK]ṃsā. 
[SEP][CLS] Just as a hot rod[MASK] iron inserted into [MASK] tube filled with sesame seeds burn[MASK][MASK] up, in the same way, many beings get [MASK][MASK] sexual intercourse[MASK]
[SEP][CLS] 5. Ap[MASK]graha[MASK]([MASK][MASK]@[MASK]@ possession[MASK]) - According to[MASK] texts [MASK] attachment to possessions [MASK] parigra[MASK] ) is of two[MASK][MASK] :[MASK] to internal possession[MASK][MASK]( āb[

In [13]:
# Show the first 30 label ids of the first sample in the batch.
# NOTE(review): -100 presumably marks positions that are excluded from the
# masked-LM loss — confirm against the data collator's documentation.
print(labels[0].numpy()[:30])


[-100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100]


In [14]:
# For this example, we keep this value to 10. But for a realistic run, start with 500.
num_epochs = 10
global_batch_size = per_replica_batch_size * strategy.num_replicas_in_sync
steps_per_epoch = num_train_samples // global_batch_size
total_train_steps = steps_per_epoch * num_epochs
# Warm up over the first twentieth of the scheduled training steps.
num_warmup_steps = total_train_steps // 20
learning_rate = 0.0001
weight_decay_rate = 1e-3

# The model and the optimizer are both created inside the strategy's scope.
with strategy.scope():
    model = transformers.TFAutoModelForMaskedLM.from_config(config)
    # Pass some dummy inputs through the model to ensure all the weights are built
    model(model.dummy_inputs)
    optimizer, schedule = transformers.create_optimizer(
        init_lr=learning_rate,
        num_train_steps=total_train_steps,
        num_warmup_steps=num_warmup_steps,
        weight_decay_rate=weight_decay_rate,
    )
    model.compile(optimizer=optimizer, metrics=["accuracy"])

In [None]:
# The local output directory and the Hub model id share the same name.
output_dir = "masked-lm-tpu"
hub_model_id = output_dir

push_to_hub_callback = transformers.PushToHubCallback(
    output_dir=output_dir, hub_model_id=hub_model_id, tokenizer=tokenizer
)
callbacks = [push_to_hub_callback]

In [None]:
# Train for `num_epochs` epochs with the callbacks configured above.
# `take(2)` restricts both the training and the validation data to their first
# two batches — presumably to keep this example run short; drop it for a real run.
model.fit(
    train_dataset.take(2),
    validation_data=eval_dataset.take(2),
    epochs=num_epochs,
    callbacks=callbacks,
)

# After training we also serialize the final model.
model.save_pretrained(output_dir)

In [None]:
from transformers import pipeline

# Replace your `model_id` here.
# Here, we're using a model that the Hugging Face team trained for longer.
model_id = "tf-tpu/roberta-base-epochs-500-no-wd"
unmasker = pipeline("fill-mask", model=model_id, framework="tf")

# Build the prompt from the loaded tokenizer's own mask token, so this cell keeps
# working when `model_id` is replaced by a model whose mask token is not "[MASK]".
prompt = f"Goal of my life is to {unmasker.tokenizer.mask_token}."
print(unmasker(prompt))