### Import Packages

In [1]:
import tensorflow as tf
from transformers import TFAutoModelForMaskedLM, BertTokenizerFast, DataCollatorForLanguageModeling

## Load model from hugging face

In [2]:
model = TFAutoModelForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Load file

In [3]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/t_bbe.csv')

In [4]:
# Select only Genesis book
genesis = df[df['b'] == 1]
genesis.head()

Unnamed: 0,id,b,c,v,t
0,1001001,1,1,1,At the first God made the heaven and the earth.
1,1001002,1,1,2,And the earth was waste and without form; and ...
2,1001003,1,1,3,"And God said, Let there be light: and there wa..."
3,1001004,1,1,4,"And God, looking on the light, saw that it was..."
4,1001005,1,1,5,"Naming the light, Day, and the dark, Night. An..."


In [7]:
# test tokenizer result
tokenizer.tokenize("In the beginning God created the heaven and the earth.")

['in',
 'the',
 'beginning',
 'god',
 'created',
 'the',
 'heaven',
 'and',
 'the',
 'earth',
 '.']

In [8]:
# convert target column to list
sentences = genesis['t'].tolist()

from nltk.tokenize import sent_tokenize

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
# break down into sentences
sentence = []
for sent in sentences:
  sentence.append(" ".join(sent_tokenize(sent)))

In [10]:
# adjust the tokenizer
tokenized_inputs = tokenizer(
    sentence,
    return_tensors="tf",
    padding=True,  # Enable padding to the longest sequence
    truncation=True,
    max_length=32  # You can set this based on your requirements
)

In [11]:
tokenized_inputs

{'input_ids': <tf.Tensor: shape=(1533, 32), dtype=int32, numpy=
array([[ 101, 2012, 1996, ...,    0,    0,    0],
       [ 101, 1998, 1996, ..., 1996, 2227,  102],
       [ 101, 1998, 2643, ...,    0,    0,    0],
       ...,
       [ 101, 2059, 3312, ..., 2455, 2046,  102],
       [ 101, 2059, 3312, ..., 5944, 2185,  102],
       [ 101, 2061, 3312, ..., 3108, 1999,  102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1533, 32), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1533, 32), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}

In [12]:
input_ids = tokenized_inputs['input_ids'].numpy()
attention_mask = tokenized_inputs['attention_mask'].numpy()

In [13]:
# split data
from sklearn.model_selection import train_test_split
train_input_ids, val_input_ids, train_attention_mask, val_attention_mask = train_test_split(
    input_ids,
    attention_mask,
    test_size=0.2,
    random_state=42  # Set a random state for reproducibility
)

In [14]:
# create tensorflow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': train_input_ids,
        'attention_mask': train_attention_mask
    }
))

In [15]:
val_dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': val_input_ids,
        'attention_mask': val_attention_mask
    }
))

In [16]:
# Initialize the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,  # Enable MLM
    mlm_probability=0.15  # Set the masking probability
)

In [17]:
# preparing for model training
train_dataset = train_dataset.map(lambda x: (
    {
        'input_ids': x['input_ids'],
        'attention_mask': x['attention_mask']
    },
    x['input_ids']  # Labels are the input_ids themselves
))

In [18]:
val_dataset = val_dataset.map(lambda x: (
    {
        'input_ids': x['input_ids'],
        'attention_mask': x['attention_mask']
    },
    x['input_ids']
))

In [19]:
# eliminate the order from for model not to capture
train_dataset = train_dataset.shuffle(buffer_size=10000)

batch_size = 2   # how many inputs processed

train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [20]:
print("\nValidation dataset prepared:")
for batch in val_dataset.take(1):  # Show one batch
    print(batch)


Validation dataset prepared:
({'input_ids': <tf.Tensor: shape=(2, 32), dtype=int32, numpy=
array([[  101,  2292,  2033,  2175,  2083,  2035,  2115, 19311,  2651,
         1010,  2635,  2041,  2013,  2426,  2068,  2035,  1996,  8351,
         2029,  2024,  4417,  2030, 11401,  2030,  2304,  1010,  1998,
         2035,  1996,  4417,  2030,   102],
       [  101,  1998,  1996,  2935,  2643,  2081,  2005,  4205,  1998,
         2005,  2010,  2564, 15695,  1997, 21049,  2005,  2037,  5929,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 32), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}, <tf.Tensor: shape=(2, 32), dtype=int32, numpy=
array([[  101,  2292,  

In [21]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer)

In [22]:
# model training
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=[early_stopping]
    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# Save the model
model.save_pretrained('/content/drive/MyDrive/mlm2')
tokenizer.save_pretrained('/content/drive/MyDrive/mlm2')

('/content/drive/MyDrive/mlm2/tokenizer_config.json',
 '/content/drive/MyDrive/mlm2/special_tokens_map.json',
 '/content/drive/MyDrive/mlm2/vocab.txt',
 '/content/drive/MyDrive/mlm2/added_tokens.json',
 '/content/drive/MyDrive/mlm2/tokenizer.json')

## Load model

In [28]:
from transformers import pipeline, AutoModelForMaskedLM, BertTokenizerFast

# Load the model and tokenizer
model = AutoModelForMaskedLM.from_pretrained('/content/drive/MyDrive/mlm2', from_tf = True)
tokenizer = BertTokenizerFast.from_pretrained('/content/drive/MyDrive/mlm2')

# Create a fill-mask pipeline
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)


All TF 2.0 model weights were used when initializing BertForMaskedLM.

Some weights of BertForMaskedLM were not initialized from the TF 2.0 model and are newly initialized: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [35]:
# ignore [PAD] prediction
pad_token_id = tokenizer.pad_token_id

text = "Worship your [MASK] ."

# Give a text to predict
result = fill_mask( text )

# Filter out the [PAD] token from the results
filtered_result = [r for r in result if r['token'] != pad_token_id]

# Print the filtered result
for r in filtered_result:
    print(f"Predicted word: {r['token_str']}, Score: {r['score']}")

Predicted word: god, Score: 0.10508973151445389
Predicted word: father, Score: 0.06969263404607773
Predicted word: body, Score: 0.04692128673195839
Predicted word: son, Score: 0.03123021498322487
Predicted word: daughter, Score: 0.023794498294591904
