In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model, callbacks
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# --- Hyperparameters ---
MAX_SEQUENCE_LEN = 200
MAX_VOCAB_SIZE = 20000
NUM_CLASSES = 4
NUM_HEADS = 4
FF_DIM = 256
DROPOUT_RATE = 0.1

EMBEDDING_DIM     = 300
LSTM_UNITS        = 64
BATCH_SIZE        = 64
EPOCHS            = 1
AUTOTUNE          = tf.data.AUTOTUNE

CLASS_NAMES       = ["World", "Sports", "Business", "Sci/Tech"]

In [3]:
# ────────────────────────────────────────────────────────────────────────────────
# 1) Load & preprocess AG News CSVs
# ────────────────────────────────────────────────────────────────────────────────
train_df = pd.read_csv("D:/AIML/data/ag_news_train.csv", header=None,
                       names=["label","title","description"])
test_df  = pd.read_csv("D:/AIML/data/ag_news_test.csv",  header=None,
                       names=["label","title","description"])
train_df["label"] -= 1
test_df["label"]  -= 1
train_df["text"] = train_df["title"].str.cat(train_df["description"], sep=" ")
test_df["text"]  = test_df["title"].str.cat(test_df["description"], sep=" ")


In [4]:
# ────────────────────────────────────────────────────────────────────────────────
# 2) Train/validation split
# ────────────────────────────────────────────────────────────────────────────────
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["text"].values,
    train_df["label"].values,
    test_size=0.2,
    random_state=42,
    stratify=train_df["label"].values
)
test_texts  = test_df["text"].values
test_labels = test_df["label"].values


In [5]:
# ────────────────────────────────────────────────────────────────────────────────
# 3) TextVectorization
# ────────────────────────────────────────────────────────────────────────────────
vectorizer = layers.TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_SEQUENCE_LEN
)
vectorizer.adapt(train_texts)

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    token_ids = vectorizer(text)
    return tf.squeeze(token_ids, axis=0), label

def make_dataset(texts, labels, shuffle=False):
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        ds = ds.shuffle(len(texts), seed=42)
    ds = ds.map(vectorize_text, num_parallel_calls=AUTOTUNE)
    return ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

train_ds = make_dataset(train_texts, train_labels, shuffle=True)
val_ds   = make_dataset(val_texts,   val_labels)
test_ds  = make_dataset(test_texts,  test_labels)

In [6]:
embeddings_index = {}
glovefile = open('D:/AIML/data/glove.42B.300d.txt','r',encoding='utf-8')
for line in tqdm(glovefile):
    values = line.split(" ")
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
glovefile.close()

print('Found %s word vectors.' % len(embeddings_index))

1917494it [02:10, 14637.88it/s]

Found 1917494 word vectors.





In [7]:
# 1) Build the embedding matrix from your GloVe dict and vectorizer vocab
vocab = vectorizer.get_vocabulary()  # list length ≥ MAX_VOCAB_SIZE
vocab = vocab[:MAX_VOCAB_SIZE]       # truncate to exactly MAX_VOCAB_SIZE
embedding_matrix = np.zeros((MAX_VOCAB_SIZE, EMBEDDING_DIM), dtype="float32")

for idx, word in enumerate(vocab):
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[idx] = vec
    # else leave zeros (or add small random noise)

In [8]:
# --- Inputs ---
text_inputs = layers.Input(shape=(MAX_SEQUENCE_LEN,), dtype=tf.int32, name="input_tokens")

# --- Embedding Layer (Token Embedding + Positional Embedding) ---
# 1. Token embedding
token_embedding = layers.Embedding(
    input_dim=MAX_VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    mask_zero=True,
    name="token_embedding"
)(text_inputs)

# 2. Positional embedding
positions = tf.range(start=0, limit=MAX_SEQUENCE_LEN, delta=1)
positions = layers.Embedding(
    input_dim=MAX_SEQUENCE_LEN,
    output_dim=EMBEDDING_DIM,
    name="position_embedding"
)(positions)
positions = tf.expand_dims(positions, axis=0)

# 3. Add token + position embeddings
x = token_embedding + positions

# --- Transformer Encoder Block ---
# 4. Multi-head self-attention
attention_output = layers.MultiHeadAttention(
    num_heads=NUM_HEADS,
    key_dim=EMBEDDING_DIM,
    dropout=DROPOUT_RATE,
    name="multihead_attention"
)(x, x)

# 5. Add & Norm
x = layers.Add(name="skip_connection_1")([x, attention_output])
x = layers.LayerNormalization(epsilon=1e-6, name="layer_norm_1")(x)

# 6. Feed Forward Network
ffn = layers.Dense(FF_DIM, activation="relu", name="ffn_1")(x)
ffn = layers.Dense(EMBEDDING_DIM, name="ffn_2")(ffn)
ffn = layers.Dropout(DROPOUT_RATE)(ffn)

# 7. Add & Norm
x = layers.Add(name="skip_connection_2")([x, ffn])
x = layers.LayerNormalization(epsilon=1e-6, name="layer_norm_2")(x)

# --- Output Head ---
# 8. Global pooling
x = layers.GlobalAveragePooling1D(name="global_avg_pool")(x)

# 9. Classification head
x = layers.Dropout(0.3, name="final_dropout")(x)
x = layers.Dense(128, activation="relu", name="dense_relu")(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(NUM_CLASSES, activation="softmax", name="classifier")(x)

# --- Final Model ---
pure_transformer_model = Model(inputs=text_inputs, outputs=outputs, name="pure_transformer_text_classifier")

pure_transformer_model.summary()


Model: "pure_transformer_text_classifier"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_tokens (InputLayer)   [(None, 200)]                0         []                            
                                                                                                  
 token_embedding (Embedding  (None, 200, 300)             6000000   ['input_tokens[0][0]']        
 )                                                                                                
                                                                                                  
 tf.math.add (TFOpLambda)    (None, 200, 300)             0         ['token_embedding[0][0]']     
                                                                                                  
 multihead_attention (Multi  (None, 200, 300)             1443900  

In [9]:
model = pure_transformer_model

In [10]:
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

In [11]:
# ────────────────────────────────────────────────────────────────────────────────
# 5) Train
# ────────────────────────────────────────────────────────────────────────────────
ckpt = callbacks.ModelCheckpoint(
    "D:/AIML/data/transformer_tc_fun.h5",
    monitor="val_accuracy",
    save_best_only=True
)
es = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[ckpt, es]
)



  saving_api.save_model(




<keras.src.callbacks.History at 0x2be2f86ad10>

In [12]:

# ────────────────────────────────────────────────────────────────────────────────
# 6) Evaluate
# ────────────────────────────────────────────────────────────────────────────────
loss, acc = model.evaluate(test_ds)
print(f"Test accuracy: {acc:.4f}")

Test accuracy: 0.2500
