<a href="https://colab.research.google.com/github/KarAnalytics/code_demos/blob/main/SimpleTransformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np


In [2]:
# 1. Load Real Data: IMDb Movie Reviews
# We limit to 10,000 words and 200 words per review for speed
vocab_size = 10000
maxlen = 200

(x_train, y_train), (x_val, y_val) = tf.keras.datasets.imdb.load_data(num_words=vocab_size)
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = tf.keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)

# 2. Define the Transformer Encoder Block
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = models.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=True):
        # Multi-Head Self-Attention
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output) # Residual connection

        # Feed Forward Network
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output) # Residual connection

# 3. Handle Token + Positional Embedding
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions # Adding position to meaning



Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [3]:
# 4. Build the Final Classification Model
embed_dim = 32  # Embedding size for each token
num_heads = 2   # Number of attention heads
ff_dim = 32     # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x) # Summarize the sequence
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = models.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# 5. Train on Real Data
print("Training Transformer on IMDb Dataset...")
model.fit(x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val))

Training Transformer on IMDb Dataset...
Epoch 1/2
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - accuracy: 0.7302 - loss: 0.5112 - val_accuracy: 0.8817 - val_loss: 0.2837
Epoch 2/2
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9225 - loss: 0.2038 - val_accuracy: 0.8693 - val_loss: 0.3114


<keras.src.callbacks.history.History at 0x7a31c9860770>

In [4]:
# 4. Predict on your Specific Test Reviews
test_reviews = [
    "this movie was an absolute masterpiece with brilliant acting",
    "i hated every minute of this film the plot was a total disaster"
]

# We must use the same word index mapping used for training
word_index = tf.keras.datasets.imdb.get_word_index()

def preprocess_text(texts):
    encoded_texts = []
    for text in texts:
        # Convert words to IMDb indices
        tokens = text.lower().split()
        sequence = [word_index.get(word, 0) + 3 for word in tokens] # +3 is an IMDb dataset quirk
        encoded_texts.append(sequence)
    return tf.keras.preprocessing.sequence.pad_sequences(encoded_texts, maxlen=maxlen)

X_test = preprocess_text(test_reviews)
predictions = model.predict(X_test)

# 5. Output Results
print("\n--- Manual Transformer Results ---")
for i, review in enumerate(test_reviews):
    sentiment = "Positive" if predictions[i][1] > 0.5 else "Negative"
    print(f"Review: {review}")
    print(f"Prediction: {sentiment} ({predictions[i][0]:.4f})\n")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step

--- Manual Transformer Results ---
Review: this movie was an absolute masterpiece with brilliant acting
Prediction: Positive (0.0042)

Review: i hated every minute of this film the plot was a total disaster
Prediction: Negative (0.9442)



In [None]:
import tensorflow as tf

# 1. Load the dataset (limiting to 10,000 most frequent words)
(x_train, y_train), _ = tf.keras.datasets.imdb.load_data(num_words=10000)

# 2. Get the word index (dictionary)
word_index = tf.keras.datasets.imdb.get_word_index()

# 3. Create a reverse word index to map integers back to words
# We shift by 3 because 0, 1, and 2 are reserved for <PAD>, <START>, and <UNK>
reverse_word_index = {value + 3: key for (key, value) in word_index.items()}
reverse_word_index[0] = "<PAD>"
reverse_word_index[1] = "<START>"
reverse_word_index[2] = "<UNK>"
reverse_word_index[3] = "<UNUSED>"

def decode_review(text_ids):
    return ' '.join([reverse_word_index.get(i, '?') for i in text_ids])

# Read and print the 5th to 10th records
# Python uses 0-based indexing, so the 5th record is index 4, and the 10th is index 9.
# The range function is exclusive of the stop value, so range(4, 10) will include indices 4, 5, 6, 7, 8, 9.
print("Querying records from 5th to 10th:")
for i in range(4, 10):
    print(f"--- Record {i+1} ---") # Displaying record number from 1-based perspective
    print(f"Label: {y_train[i]} (1 = Positive, 0 = Negative)")
    print(f"Text: {decode_review(x_train[i][:50])}...") # Printing first 50 words
    print("\n")

In [5]:
### Why is the label inversion happening? NOT SURE

# 1. Load the dataset (limiting to 10,000 most frequent words)
(x_train, y_train), _ = tf.keras.datasets.imdb.load_data(num_words=10000)

# 2. Get the word index (dictionary)
word_index = tf.keras.datasets.imdb.get_word_index()

# 3. Create a reverse word index to map integers back to words
# We shift by 3 because 0, 1, and 2 are reserved for <PAD>, <START>, and <UNK>
reverse_word_index = {value + 3: key for (key, value) in word_index.items()}
reverse_word_index[0] = "<PAD>"
reverse_word_index[1] = "<START>"
reverse_word_index[2] = "<UNK>"
reverse_word_index[3] = "<UNUSED>"

def decode_review(text_ids):
    return ' '.join([reverse_word_index.get(i, '?') for i in text_ids])

# 4. Read and print few sample records
# for i in range(2):
for i in range(4,10):
    print(f"--- Record {i} ---")
    print(f"Label: {y_train[i]} (1 = Positive, 0 = Negative)")
    print(f"Text: {decode_review(x_train[i][:50])}...") # Printing first 50 words
    print("\n")

--- Record 4 ---
Label: 0 (1 = Positive, 0 = Negative)
Text: <START> worst mistake of my life br br i picked this movie up at target for 5 because i figured hey it's sandler i can get some cheap laughs i was wrong completely wrong mid way through the film all three of my friends were asleep and i was still...


--- Record 5 ---
Label: 0 (1 = Positive, 0 = Negative)
Text: <START> begins better than it ends funny that the russian submarine crew <UNK> all other actors it's like those scenes where documentary shots br br spoiler part the message <UNK> was contrary to the whole story it just does not <UNK> br br...


--- Record 6 ---
Label: 1 (1 = Positive, 0 = Negative)
Text: <START> lavish production values and solid performances in this straightforward adaption of jane <UNK> satirical classic about the marriage game within and between the classes in <UNK> 18th century england northam and paltrow are a <UNK> mixture as friends who must pass through <UNK> and lies to discover that they...


In [None]:
### Let's do it the easier way with BERT (Note that there is no training involved here)

from transformers import pipeline

# 1. The Single-Line Model: Load a pre-trained Transformer (DistilBERT)
# This handles tokenization, encoding, and the classification head automatically.
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# 2. Test it on real data (Supervised Learning Inference)
test_reviews = [
    "This movie was an absolute masterpiece with brilliant acting.",
    "I hated every minute of this film; the plot was a total disaster."
]

results = classifier(test_reviews)

# 3. Print the results
for review, result in zip(test_reviews, results):
    print(f"Review: {review}")
    print(f"Result: {result['label']} (Confidence: {result['score']:.4f})\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



Review: This movie was an absolute masterpiece with brilliant acting.
Result: POSITIVE (Confidence: 0.9999)

Review: I hated every minute of this film; the plot was a total disaster.
Result: NEGATIVE (Confidence: 0.9998)

