In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model, callbacks
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# ────────────────────────────────────────────────────────────────────────────────
# 0) Hyperparameters & Constants
# ────────────────────────────────────────────────────────────────────────────────
MAX_VOCAB_SIZE    = 20000
MAX_SEQUENCE_LEN  = 200
EMBEDDING_DIM     = 300
LSTM_UNITS        = 64
BATCH_SIZE        = 64
EPOCHS            = 1
AUTOTUNE          = tf.data.AUTOTUNE
NUM_CLASSES       = 4
CLASS_NAMES       = ["World", "Sports", "Business", "Sci/Tech"]


In [3]:
# ────────────────────────────────────────────────────────────────────────────────
# 1) Load & preprocess AG News CSVs
# ────────────────────────────────────────────────────────────────────────────────
train_df = pd.read_csv("D:/AIML/data/ag_news_train.csv", header=None,
                       names=["label","title","description"])
test_df  = pd.read_csv("D:/AIML/data/ag_news_test.csv",  header=None,
                       names=["label","title","description"])
train_df["label"] -= 1
test_df["label"]  -= 1
train_df["text"] = train_df["title"].str.cat(train_df["description"], sep=" ")
test_df["text"]  = test_df["title"].str.cat(test_df["description"], sep=" ")


In [4]:
# ────────────────────────────────────────────────────────────────────────────────
# 2) Train/validation split
# ────────────────────────────────────────────────────────────────────────────────
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["text"].values,
    train_df["label"].values,
    test_size=0.2,
    random_state=42,
    stratify=train_df["label"].values
)
test_texts  = test_df["text"].values
test_labels = test_df["label"].values


In [5]:
# ────────────────────────────────────────────────────────────────────────────────
# 3) TextVectorization
# ────────────────────────────────────────────────────────────────────────────────
vectorizer = layers.TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_SEQUENCE_LEN
)
vectorizer.adapt(train_texts)

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    token_ids = vectorizer(text)
    return tf.squeeze(token_ids, axis=0), label

def make_dataset(texts, labels, shuffle=False):
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        ds = ds.shuffle(len(texts), seed=42)
    ds = ds.map(vectorize_text, num_parallel_calls=AUTOTUNE)
    return ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

train_ds = make_dataset(train_texts, train_labels, shuffle=True)
val_ds   = make_dataset(val_texts,   val_labels)
test_ds  = make_dataset(test_texts,  test_labels)

In [6]:
embeddings_index = {}
glovefile = open('D:/AIML/data/glove.42B.300d.txt','r',encoding='utf-8')
for line in tqdm(glovefile):
    values = line.split(" ")
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
glovefile.close()

print('Found %s word vectors.' % len(embeddings_index))

1917494it [02:10, 14742.52it/s]

Found 1917494 word vectors.





In [7]:
# 1) Build the embedding matrix from your GloVe dict and vectorizer vocab
vocab = vectorizer.get_vocabulary()  # list length ≥ MAX_VOCAB_SIZE
vocab = vocab[:MAX_VOCAB_SIZE]       # truncate to exactly MAX_VOCAB_SIZE
embedding_matrix = np.zeros((MAX_VOCAB_SIZE, EMBEDDING_DIM), dtype="float32")

for idx, word in enumerate(vocab):
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[idx] = vec
    # else leave zeros (or add small random noise)

In [8]:
text_inputs = layers.Input(shape=(MAX_SEQUENCE_LEN,),name="input_tokens", dtype="int32")
embedding_layer = layers.Embedding(input_dim=MAX_VOCAB_SIZE, 
                                   output_dim=EMBEDDING_DIM,
                                   input_length=MAX_SEQUENCE_LEN, 
                                   weights=[embedding_matrix], 
                                   trainable=False,
                                   mask_zero =True)
positional_embedding_layer = layers.Embedding(input_dim=MAX_SEQUENCE_LEN, output_dim=EMBEDDING_DIM, trainable=True)
embedded_sequences = embedding_layer(text_inputs)

positions = tf.range(start=0, limit=MAX_SEQUENCE_LEN, delta=1)
positions = positional_embedding_layer(positions)

embedded_sequences = embedded_sequences + positions


num_heads = 2
ff_dim =256

attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=EMBEDDING_DIM)
attn_out = attn(embedded_sequences, embedded_sequences)
attn_out = layers.Dropout(0.5)(attn_out)
attn_out_f = layers.LayerNormalization(axis=-1)(embedded_sequences + attn_out)

ffn_out = layers.Dense(ff_dim, activation="relu")(attn_out_f)
ffn_out = layers.Dense(EMBEDDING_DIM)(ffn_out)
ffn_out = layers.Dropout(0.5)(ffn_out)
ffn_out_f = layers.LayerNormalization(axis=-1)(attn_out_f + ffn_out)

conv41 = layers.Conv1D(filters=128, kernel_size=16, activation="relu")(ffn_out_f)
pool41 = layers.MaxPooling1D()(conv41)
norm41 = layers.LayerNormalization(axis=-1)(pool41)
conv42 = layers.Conv1D(filters=256, kernel_size=16, activation="relu")(norm41)

trans_pool1 = layers.GlobalAveragePooling1D()(conv42)
trans_d = layers.Dense(256, activation="relu")(trans_pool1)
text_features = layers.Dropout(0.5, name="text_features")(trans_d)

text_features = layers.Dense(256, activation="relu")(text_features)
text_features = layers.LayerNormalization(axis=-1)(text_features)
text_out = layers.Dense(NUM_CLASSES, activation="softmax", name="text_out")(text_features)

model = Model(inputs=[text_inputs], outputs = [text_out] )




In [9]:

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_tokens (InputLayer)   [(None, 200)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 200, 300)             6000000   ['input_tokens[0][0]']        
                                                                                                  
 tf.math.add (TFOpLambda)    (None, 200, 300)             0         ['embedding[0][0]']           
                                                                                                  
 multi_head_attention (Mult  (None, 200, 300)             722100    ['tf.math.add[0][0]',         
 iHeadAttention)                                                     'tf.math.add[0][0]']     

In [10]:
# ────────────────────────────────────────────────────────────────────────────────
# 5) Train
# ────────────────────────────────────────────────────────────────────────────────
ckpt = callbacks.ModelCheckpoint(
    "D:/AIML/data/transformer_tc_fun_glove.h5",
    monitor="val_accuracy",
    save_best_only=True
)
es = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[ckpt, es]
)



  saving_api.save_model(




<keras.src.callbacks.History at 0x1fcbff1b400>

In [11]:

# ────────────────────────────────────────────────────────────────────────────────
# 6) Evaluate
# ────────────────────────────────────────────────────────────────────────────────
loss, acc = model.evaluate(test_ds)
print(f"Test accuracy: {acc:.4f}")

Test accuracy: 0.8997
