In [None]:
from transformers import TFBertForMaskedLM, BertTokenizer
import tensorflow as tf

# Load the pretrained BERT tokenizer together with its masked-LM head.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')

# Build the prompt from the tokenizer's own mask token instead of a
# hard-coded "[MASK]" literal.
input_text = f"The cat sat on the {tokenizer.mask_token} and looked out the window."

# Encode the prompt as TensorFlow tensors and run a single forward pass.
inputs = tokenizer(input_text, return_tensors="tf")
outputs = model(inputs)
logits = outputs.logits

# tf.where returns one (batch, position) coordinate pair per match; keep the
# sequence position of the first match.
mask_positions = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)
mask_token_index = mask_positions[0][1].numpy()

# Scores at the masked position of the first sequence in the batch.
mask_logits = logits[0, mask_token_index]

# Single best candidate for the masked slot.
predicted_token_id = tf.argmax(mask_logits).numpy()
predicted_token = tokenizer.decode([predicted_token_id])

# The five highest-scoring candidates, best first.
top_k = 5
top_k_ids = tf.math.top_k(mask_logits, k=top_k).indices.numpy()
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_ids)

print("Top predictions:")
for i, token in enumerate(top_k_tokens, start=1):
    print(f"{i}: {token}")


All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Top predictions:
1: floor
2: bed
3: couch
4: sofa
5: ground


In [None]:
!pip install -q tensorflow transformers datasets

import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, create_optimizer
from datasets import load_dataset

# 1. Load small IMDB subset
# NOTE: the IMDB splits on the Hub are ordered by label, so taking the first
# N rows yields a single-class subset and the classifier just learns to emit
# that class. Shuffle (with a fixed seed for reproducibility) before selecting
# so both subsets contain a mix of positive and negative reviews.
dataset = load_dataset("imdb")
small_train = dataset["train"].shuffle(seed=42).select(range(500))
small_test = dataset["test"].shuffle(seed=42).select(range(100))

# 2. Load tokenizer & model
# The classification head on top of DistilBERT is newly initialised and only
# becomes useful after the fine-tuning step further down.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# 3. Tokenize the text
def tokenize(batch):
    """Encode the ``"text"`` entry of ``batch`` as fixed-length model inputs.

    The text is passed to the module-level ``tokenizer``, truncated to at most
    128 tokens and padded up to exactly 128, and the tokenizer's output is
    returned unchanged.
    """
    encoded = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

# Training hyperparameters, defined once so the input pipeline, the learning
# rate schedule and model.fit() cannot drift apart when one of them is edited.
num_epochs = 1
batch_size = 16

# Add input_ids / attention_mask columns to both subsets.
small_train = small_train.map(tokenize, batched=True)
small_test = small_test.map(tokenize, batched=True)

small_train.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])
small_test.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])

# Wrap the subsets as batched tf.data pipelines; only the training data is
# shuffled.
train_tf_dataset = small_train.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols="label",
    shuffle=True,
    batch_size=batch_size
)

test_tf_dataset = small_test.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols="label",
    shuffle=False,
    batch_size=batch_size
)

# 4. Hugging Face create_optimizer
# The schedule length is derived from the number of batches per epoch times
# the number of epochs actually trained, so it always spans the whole run.
num_train_steps = len(train_tf_dataset) * num_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0
)

# The model outputs raw logits, hence from_logits=True.
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

# 5. Train model
model.fit(train_tf_dataset, validation_data=test_tf_dataset, epochs=num_epochs)

# 6. Predict on custom input
def predict_sentiment(text):
    """Classify ``text`` as positive or negative with the module-level model.

    Args:
        text: The review text to classify.

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is ``"Positive 😊"`` when
        the probability at index 1 exceeds the one at index 0, otherwise
        ``"Negative 😞"``. ``confidence`` is the larger of the class
        probabilities as a built-in ``float`` rounded to two decimals.
    """
    inputs = tokenizer(text, return_tensors="tf", truncation=True, padding="max_length", max_length=128)
    logits = model(inputs).logits
    # Softmax over the class axis; [0] picks the single example in the batch.
    probs = tf.nn.softmax(logits, axis=1).numpy()[0]
    label = "Positive 😊" if probs[1] > probs[0] else "Negative 😞"
    # round() on a NumPy float32 returns another float32, which formats as
    # e.g. 0.9800000190734863; convert to a built-in float first so the value
    # really is rounded to two decimals when printed.
    confidence = round(float(probs.max()), 2)
    return label, confidence

# 7. Example predictions
examples = [
    "This movie was amazing! I loved every moment.",
    "Worst movie ever. Total waste of time.",
    "It was okay, not great but not terrible either."
]

# map() is lazy, so each review is still classified immediately before its
# own result is printed.
for example, (sentiment, conf) in zip(examples, map(predict_sentiment, examples)):
    print(f"\nInput: {example}\nPrediction: {sentiment} (Confidence: {conf})")


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
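You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.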

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]


Input: This movie was amazing! I loved every moment.
Prediction: Negative 😞 (Confidence: 0.9800000190734863)

Input: Worst movie ever. Total waste of time.
Prediction: Negative 😞 (Confidence: 0.9800000190734863)

Input: It was okay, not great but not terrible either.
Prediction: Negative 😞 (Confidence: 0.9800000190734863)
