In [None]:
!pip install transformers torch tensorflow --quiet

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import mixed_precision
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.datasets import imdb


# Hyperparameters and dataset-size limits shared by the whole script.
hp = dict(
    MAX_WORD_INDEX=20_000,   # passed to imdb.load_data as num_words
    MAX_LEN=128,             # max token length handed to the tokenizer
    BATCH_SIZE=32,
    EPOCHS=3,
    LEARNING_RATE=2e-5,
    MAX_SAMPLES=2000,        # cap on training reviews (None = no cap)
    MAX_TEST_SAMPLES=750,    # cap on test reviews (None = no cap)
)


# Build the word -> id vocabulary with every id shifted up by 3 so that
# ids 0, 1 and 2 are free for the reserved markers added right after.
word_index = {token: rank + 3 for token, rank in imdb.get_word_index().items()}
word_index.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Inverse mapping (id -> word) used to turn id sequences back into text.
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

def decode_review(text_ids):
    """Turn a sequence of word ids back into a space-separated string.

    Each id is looked up in ``reverse_word_index``; ids with no entry
    contribute an empty string, so the separator around them is kept.

    NOTE(review): the reserved ids decode to the literal markers
    "<PAD>", "<START>" and "<UNK>", which therefore appear verbatim in
    the text later passed to the tokenizer - confirm this is intended.
    """
    words = (reverse_word_index.get(token_id, '') for token_id in text_ids)
    return ' '.join(words)

# Load the IMDB reviews as sequences of word ids, with the vocabulary
# limited through num_words=hp['MAX_WORD_INDEX'].
(train_sequences, train_labels), (test_sequences, test_labels) = imdb.load_data(
    num_words=hp['MAX_WORD_INDEX']
)

# Optionally keep only the first n training reviews (None disables the cap).
if hp['MAX_SAMPLES'] is not None:
    n = min(len(train_sequences), hp['MAX_SAMPLES'])
    train_sequences, train_labels = train_sequences[:n], train_labels[:n]

# Same cap for the test split, controlled by its own setting.
if hp['MAX_TEST_SAMPLES'] is not None:
    m = min(len(test_sequences), hp['MAX_TEST_SAMPLES'])
    test_sequences, test_labels = test_sequences[:m], test_labels[:m]


# The tokenizer below works on raw text, so convert the id sequences back
# into strings first.
train_textos = list(map(decode_review, train_sequences))
test_textos = list(map(decode_review, test_sequences))

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def bert_encode(textos, max_len=hp['MAX_LEN']):
    """Tokenize ``textos`` with the module-level DistilBERT tokenizer.

    Args:
        textos: Text (or list of texts) to encode.
        max_len: Maximum token length; longer inputs are truncated.

    Returns:
        The tokenizer output, padded and returned as TensorFlow tensors.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_len,
        "return_tensors": "tf",
    }
    return tokenizer(textos, **tokenizer_options)

# Tokenize both splits once, up front, so the tf.data pipelines below only
# have to slice ready-made tensors.
train_encodings = bert_encode(train_textos, hp['MAX_LEN'])
test_encodings = bert_encode(test_textos, hp['MAX_LEN'])

# Training pipeline: shuffle -> batch -> prefetch.
# The shuffle buffer is sized to the whole training set. A fixed buffer of
# 1000 is smaller than the dataset, so it only shuffles within a sliding
# window; tf.data needs buffer_size >= dataset size for a uniform shuffle.
# max(1, ...) keeps buffer_size valid even if the split were empty.
train_ds = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
train_ds = (
    train_ds
    .shuffle(max(1, len(train_labels)))
    .batch(hp['BATCH_SIZE'])
    .prefetch(tf.data.AUTOTUNE)
)

# Evaluation pipeline: same batching, but never shuffled.
test_ds = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels))
test_ds = test_ds.batch(hp['BATCH_SIZE']).prefetch(tf.data.AUTOTUNE)

# Load DistilBERT with a 2-class classification head.
# First attempt: load without safetensors. If that fails for any ordinary
# reason, fall back to converting the PyTorch checkpoint (from_pt=True).
# Catch Exception rather than using a bare `except:` so KeyboardInterrupt
# and SystemExit still propagate, and report why the first attempt failed
# instead of hiding it.
try:
    model = TFDistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
        use_safetensors=False
    )
except Exception as load_error:
    print(
        "Could not load the model with use_safetensors=False "
        f"({type(load_error).__name__}: {load_error}); retrying with from_pt=True."
    )
    model = TFDistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
        from_pt=True
    )


# Freeze the first three transformer layers so their weights are not
# updated during fine-tuning; the remaining layers stay trainable.
for layer in model.distilbert.transformer.layer[:3]:
    layer.trainable = False

# Temporarily force the global mixed-precision policy to float32 while the
# optimizer is constructed, then restore whatever policy was active before.
# NOTE(review): presumably this keeps the optimizer from being created under
# a mixed-precision policy - confirm the original motivation.
original_policy = mixed_precision.global_policy()
mixed_precision.set_global_policy(mixed_precision.Policy('float32'))

optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=hp['LEARNING_RATE'])

mixed_precision.set_global_policy(original_policy)

# from_logits=True: the loss receives the model's raw logits (softmax is
# only applied later, in classificar_frase).
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[metric])

# Stop after one epoch without improvement and roll back to the best weights.
# No `monitor` is given, so the callback's default monitored quantity is used.
early_stop = keras.callbacks.EarlyStopping(
    patience=1,
    restore_best_weights=True
)

# NOTE(review): test_ds is used as validation data here and again for the
# final evaluate() below, so early stopping / best-weight selection is driven
# by the same data the final score is reported on. Consider a separate
# validation split if an unbiased test metric is needed.
history = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=hp['EPOCHS'],
    callbacks=[early_stop]
)

# Final loss and accuracy on the test pipeline.
loss, acc = model.evaluate(test_ds)

def classificar_frase(texto):
    """Classify the sentiment of ``texto`` with the fine-tuned model.

    Args:
        texto: Text to classify.

    Returns:
        A ``(label, confidence)`` tuple: the label is "POSITIVO" when
        class 1 has the highest probability and "NEGATIVO" otherwise;
        the confidence is that highest softmax probability as a float.
    """
    encoded = tokenizer(
        texto,
        return_tensors="tf",
        truncation=True,
        padding=True,
        max_length=hp['MAX_LEN'],
    )
    output = model(encoded)
    # Take the first (only) row of the batch as a NumPy probability vector.
    probabilities = tf.nn.softmax(output.logits)[0].numpy()
    if np.argmax(probabilities) == 1:
        rotulo = "POSITIVO"
    else:
        rotulo = "NEGATIVO"
    confianca = float(np.max(probabilities))
    return rotulo, confianca




In [None]:
# Manual smoke test: classify a couple of hand-written sentences and print
# the predicted label with its confidence.
for frase_teste in (
    "The movie was okay at the beginning but the ending ruined everything.",
    "Men want to give birth sooooo bad it’s crazy",
):
    lbl, conf = classificar_frase(frase_teste)
    print(f"\nTeste Manual: \"{frase_teste}\" → {lbl} ({conf:.2f})")