In [4]:
# ─────────────────────────────────────────────────────────────────────────────
# Refactored Code: Text Classification using Pretrained BERT
# Dataset: AG News (4 classes)
# ─────────────────────────────────────────────────────────────────────────────

# Install transformers if needed
# pip install transformers

import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import layers, models, callbacks

In [5]:
# ─────────────────────────────────────────────────────────────────────────────
# 1) Load Data (Assume you already have train.csv and test.csv)
# ─────────────────────────────────────────────────────────────────────────────
train_df = pd.read_csv("D:/AIML/data/ag_news_train.csv", header=None, names=["label", "title", "description"])
test_df  = pd.read_csv("D:/AIML/data/ag_news_test.csv", header=None, names=["label", "title", "description"])

# Zero-base the labels (1-4 → 0-3)
train_df["label"] -= 1
test_df["label"]  -= 1

# Combine title + description
train_df["text"] = train_df["title"] + " " + train_df["description"]
test_df["text"]  = test_df["title"] + " " + test_df["description"]

train_texts = train_df["text"].tolist()
train_labels = train_df["label"].tolist()
test_texts = test_df["text"].tolist()
test_labels = test_df["label"].tolist()

In [6]:
# ─────────────────────────────────────────────────────────────────────────────
# 2) Setup Hyperparameters
# ─────────────────────────────────────────────────────────────────────────────
BERT_MODEL_NAME = "bert-base-uncased"
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 3
NUM_CLASSES = 4
AUTOTUNE = tf.data.AUTOTUNE


In [7]:
# ─────────────────────────────────────────────────────────────────────────────
# 3) Tokenization using BERT Tokenizer
# ─────────────────────────────────────────────────────────────────────────────
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

def encode_texts(texts):
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_tensors='tf'
    )

train_encodings = encode_texts(train_texts)
test_encodings = encode_texts(test_texts)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# ─────────────────────────────────────────────────────────────────────────────
# 4) Create tf.data.Dataset
# ─────────────────────────────────────────────────────────────────────────────
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(len(train_texts)).batch(BATCH_SIZE).prefetch(AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [9]:
# ─────────────────────────────────────────────────────────────────────────────
# 5) Build Model with BERT Encoder
# ─────────────────────────────────────────────────────────────────────────────
bert_model = TFBertModel.from_pretrained(BERT_MODEL_NAME)

# Freeze BERT layers (optional for faster training)
bert_model.trainable = False

# Input layers
input_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")

# BERT output
bert_outputs = bert_model(
    input_ids,
    attention_mask=attention_mask
)

# Take [CLS] token output (pooled output)
pooled_output = bert_outputs.pooler_output

# Classification head
x = layers.Dense(128, activation='relu')(pooled_output)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(NUM_CLASSES, activation='softmax')(x)

# Final model
model = models.Model(inputs=[input_ids, attention_mask], outputs=outputs)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 )                           ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, 128, 7                                       

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 6) Train
# ─────────────────────────────────────────────────────────────────────────────
ckpt = callbacks.ModelCheckpoint("D:/AIML/data/bert_tc_fun.h5", save_best_only=True, monitor="val_accuracy")
es = callbacks.EarlyStopping(patience=2, restore_best_weights=True)

history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=EPOCHS,
    callbacks=[ckpt, es]
)


Epoch 1/3


  inputs = self._flatten_to_reference_inputs(inputs)


 153/3750 [>.............................] - ETA: 100:20:52 - loss: 1.5262 - accuracy: 0.2582

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 7) Evaluate
# ─────────────────────────────────────────────────────────────────────────────
test_loss, test_acc = model.evaluate(test_dataset)
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# 8) Sample Predictions
# ─────────────────────────────────────────────────────────────────────────────
sample_texts = [
    "SpaceX launches new batch of Starlink satellites.",
    "The stock market crashed due to inflation fears.",
    "Manchester United wins against Liverpool in thriller match.",
    "Scientists discover a new particle at CERN."
]

sample_encodings = encode_texts(sample_texts)
sample_preds = model.predict(dict(sample_encodings))

for text, probs in zip(sample_texts, sample_preds):
    pred_class = np.argmax(probs)
    print(f"Text: {text[:50]}... Predicted Class: {pred_class} (Conf: {probs[pred_class]:.2%})")
