In [7]:
# ────────────────────────────────────────────────────────────────────────────────
# 1) Imports & Constants
# ────────────────────────────────────────────────────────────────────────────────
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from sklearn.model_selection import train_test_split

In [8]:
# Tunable settings for the AG News BiLSTM classifier, grouped by role.

# --- Text preprocessing ---
MAX_VOCAB_SIZE = 20_000    # cap handed to TextVectorization(max_tokens=...)
MAX_SEQUENCE_LEN = 200     # every sequence is padded or truncated to this length

# --- Model ---
EMBEDDING_DIM = 128        # width of each token embedding vector
LSTM_UNITS = 64            # units per LSTM direction
NUM_CLASSES = 4            # AG News has 4 categories

# --- Training / input pipeline ---
BATCH_SIZE = 64
EPOCHS = 10                # upper bound on training epochs
AUTOTUNE = tf.data.AUTOTUNE

# Human-readable names, indexed by the zero-based label.
CLASS_NAMES = ["World", "Sports", "Business", "Sci/Tech"]

In [10]:
# ────────────────────────────────────────────────────────────────────────────────
# 2) Load CSVs into Pandas
# ────────────────────────────────────────────────────────────────────────────────
# Files downloaded via get_file or otherwise placed in working dir
# Single place to repoint the notebook at a different data folder.
DATA_DIR = "D:/AIML/data"
CSV_COLUMNS = ["label", "title", "description"]


def load_ag_news(csv_path):
    """Read one header-less AG News CSV and prepare it for modelling.

    Args:
        csv_path: Path to a CSV whose three columns are label, title and
            description (in that order, no header row).

    Returns:
        A DataFrame with columns ``label`` (shifted down by one so that
        1→0, 2→1, …, 4→3), ``title``, ``description`` and ``text``
        (title and description joined by a single space).
    """
    df = pd.read_csv(csv_path, header=None, names=CSV_COLUMNS)

    # Zero-base the labels (1→0, 2→1, …, 4→3)
    df["label"] -= 1

    # Combine title + description into a single text column.
    # na_rep="" keeps `text` a string when either part is missing; without it
    # str.cat would produce NaN for that row.
    df["text"] = df["title"].str.cat(df["description"], sep=" ", na_rep="")
    return df


train_df = load_ag_news(f"{DATA_DIR}/ag_news_train.csv")
test_df  = load_ag_news(f"{DATA_DIR}/ag_news_test.csv")

In [22]:
# Preview the first three rows to check that the columns were parsed as expected.
train_df.head(3)

Unnamed: 0,label,title,description,text
0,2,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...
1,2,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
2,2,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...


In [14]:
# (rows, columns) of the training frame.
train_df.shape

(120000, 4)

In [16]:
# ────────────────────────────────────────────────────────────────────────────────
# 3) Split train → (train, validation)
# ────────────────────────────────────────────────────────────────────────────────
# Hold out 20% of the labelled training rows for validation. The split is
# stratified on the label and uses a fixed random_state so it is repeatable.
_all_texts = train_df["text"].values
_all_labels = train_df["label"].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    _all_texts,
    _all_labels,
    test_size=0.2,
    random_state=42,
    stratify=_all_labels,
)

# The test CSV is used as-is for the final evaluation.
test_texts = test_df["text"].values
test_labels = test_df["label"].values

# One line per split: shape of the texts, shape of the labels.
for _split_texts, _split_labels in (
    (train_texts, train_labels),
    (val_texts, val_labels),
    (test_texts, test_labels),
):
    print(_split_texts.shape, _split_labels.shape)

(96000,) (96000,)
(24000,) (24000,)
(7600,) (7600,)


In [17]:
# ────────────────────────────────────────────────────────────────────────────────
# 4) Text Vectorization Layer
# ────────────────────────────────────────────────────────────────────────────────
# Maps raw strings to fixed-length integer token-ID sequences.
vectorizer = layers.TextVectorization(
    output_mode="int",
    max_tokens=MAX_VOCAB_SIZE,
    output_sequence_length=MAX_SEQUENCE_LEN,
)

# Build the vocabulary from the training split only, so that validation and
# test text does not influence it.
vectorizer.adapt(train_texts)

""" 
Illustrative example only (not executed in this notebook): what a
TextVectorization layer with output_sequence_length=6 would produce on a toy
corpus. `sample_texts` stands for the three sentences listed at the bottom.

# 3) Inspect the first 10 entries of the built vocabulary
vocab = vectorizer.get_vocabulary()
print("Vocabulary (top 10):", vocab[:10])

# 4) Turn each sample text into its integer sequence
vecs = vectorizer(sample_texts).numpy()
for text, seq in zip(sample_texts, vecs):
    print(f"\"{text}\" → {seq}")

Expected output:
Vocabulary (top 10): ['', '[UNK]', 'the', 'was', 'movie', 'film', 'great', 'hated', 'plot', 'boring']

"the movie was great"  → [2 4 3 6 0 0]
"I hated the film"      → [1 7 2 5 0 0]
"the plot was boring"   → [2 8 3 9 0 0]
"""


def vectorize_text(text, label):
    """Convert one ``(text, label)`` dataset element to ``(token_ids, label)``.

    Args:
        text: A single string tensor, e.g. the scalar ``b"hello world"``.
        label: The element's label; passed through untouched.

    Returns:
        A tuple ``(token_ids, label)`` where ``token_ids`` is the vectorizer's
        output for ``text`` with the temporary batch axis removed.
    """
    # The text is given a trailing axis before it is handed to the vectorizer
    # (scalar → shape (1,)); the vectorizer then returns one row of integer
    # token IDs, and squeezing axis 0 strips that temporary batch axis again.
    return tf.squeeze(vectorizer(tf.expand_dims(text, axis=-1)), axis=0), label

""" 
text_input = tf.constant("hello unknown world")
label_input = tf.constant(1)   # e.g. class 1

[b"hello unknown world"]  shape=(1,)  

pads to length 6 → [2,1,3,0,0,0]

#([2,1,3,0,0,0], 1) ready for batching into a tf.data.Dataset.


"""



' \ntext_input = tf.constant("hello unknown world")\nlabel_input = tf.constant(1)   # e.g. class 1\n\n[b"hello unknown world"]  shape=(1,)  \n\n#([2,1,3,0,0,0], 1)\n\n\n'

In [18]:
# ────────────────────────────────────────────────────────────────────────────────
# 5) Build tf.data Pipelines
# ────────────────────────────────────────────────────────────────────────────────
def make_dataset(texts, labels, shuffle=False):
    """Build a batched, prefetched ``tf.data.Dataset`` of (token_ids, label).

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of labels aligned with ``texts``.
        shuffle: When true, shuffle the individual examples (buffer spans the
            whole dataset, fixed seed) before they are vectorized.

    Returns:
        A dataset yielding batches of ``BATCH_SIZE`` vectorized examples.
    """
    # One dataset element per (text, label) pair.
    pairs = tf.data.Dataset.from_tensor_slices((texts, labels))

    if shuffle:
        pairs = pairs.shuffle(buffer_size=len(texts), seed=42)

    # Raw string → integer sequence; the label rides along unchanged.
    vectorized = pairs.map(vectorize_text, num_parallel_calls=AUTOTUNE)

    # Batch, then prefetch so data preparation overlaps with training.
    batched = vectorized.batch(BATCH_SIZE)
    return batched.prefetch(AUTOTUNE)

# Only the training split is shuffled; validation and test keep their order.
train_ds = make_dataset(train_texts, train_labels, shuffle=True)
val_ds = make_dataset(val_texts, val_labels, shuffle=False)
test_ds = make_dataset(test_texts, test_labels, shuffle=False)

In [21]:
""" 
texts  = ["hello world", "goodbye world"]
labels = [0, 1]   # two classes
BATCH_SIZE = 2
MAX_SEQUENCE_LEN = 4  # for simplicity

and assume our vectorize_text will map
"hello world" → [2,3,0,0]

"goodbye world" → [4,3,0,0]

1.from_tensor_slices
("hello world", 0)
("goodbye world", 1)

2.map(vectorize_text)
([2,3,0,0], 0)
([4,3,0,0], 1)

3. batch(2)
Groups both elements into a single batch:
sequences become a tensor of shape (2, 4)
[[2,3,0,0],
 [4,3,0,0]]

labels become a tensor of shape (2,)
[0, 1]

4. prefetch(AUTOTUNE)
Overlaps data-preparation with model training for maximum throughput

Token batch:
 [[2 3 0 0]
  [4 3 0 0]]
Token batch shape: (2, 4)

Label batch:
 [0 1]
Label batch shape: (2,)

"""
# Sanity check: pull a single batch from the training pipeline and show the
# shapes of its token and label tensors.
_first_batch_only = train_ds.take(1)
for tokens, targets in _first_batch_only:
    print("Token batch shape:", tokens.shape)
    print("Label batch shape:", targets.shape)


Token batch shape: (64, 200)
Label batch shape: (64,)


In [23]:
# ────────────────────────────────────────────────────────────────────────────────
# 6) Define Embedding + BiLSTM Model
# ────────────────────────────────────────────────────────────────────────────────
# Architecture (tensor shape after each stage):
#   Input:          integer sequences (batch, MAX_SEQUENCE_LEN)
#   Embedding:      → (batch, MAX_SEQUENCE_LEN, EMBEDDING_DIM)
#   BiLSTM:         → (batch, LSTM_UNITS*2)
#   Dense+Dropout:  → (batch, 64)
#   Output:         → (batch, NUM_CLASSES)
model = models.Sequential([
    # Declare the input shape explicitly instead of through Embedding's
    # `input_length` argument, which Keras 3 deprecates.
    layers.Input(shape=(MAX_SEQUENCE_LEN,), dtype="int64"),
    layers.Embedding(
        input_dim=MAX_VOCAB_SIZE,
        output_dim=EMBEDDING_DIM,
        mask_zero=True,  # treat token ID 0 as padding and mask it downstream
    ),
    layers.Bidirectional(layers.LSTM(LSTM_UNITS)),
    layers.Dropout(0.5),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(NUM_CLASSES, activation="softmax"),
])

# Labels are plain integers (0..NUM_CLASSES-1), hence the *sparse* variant of
# categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          2560000   
                                                                 
 bidirectional (Bidirection  (None, 128)               98816     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 4)                 260       
                                                        

In [None]:
# ────────────────────────────────────────────────────────────────────────────────
# 7) Callbacks
# ────────────────────────────────────────────────────────────────────────────────
# Write the model to disk whenever validation accuracy reaches a new best.
checkpoint_cb = callbacks.ModelCheckpoint(
    filepath="D:/AIML/data/best_agnews_bilstm.h5",
    save_best_only=True,
    monitor="val_accuracy",
)

# Stop after 3 epochs without a validation-loss improvement and restore the
# weights from the best-loss epoch.
# NOTE(review): the two callbacks watch different metrics (val_accuracy vs
# val_loss), so the file on disk and the restored in-memory weights can come
# from different epochs — confirm this is intended.
earlystop_cb = callbacks.EarlyStopping(
    patience=3,
    monitor="val_loss",
    restore_best_weights=True,
)

In [25]:
# ────────────────────────────────────────────────────────────────────────────────
# 8) Train
# ────────────────────────────────────────────────────────────────────────────────
# 96,000 training samples / batch size 64 = 1,500 batches per epoch.
# EPOCHS is an upper bound: the early-stopping callback may end training sooner.
history = model.fit(
    x=train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    callbacks=[checkpoint_cb, earlystop_cb],
)

Epoch 1/10

  saving_api.save_model(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 101/1500 [=>............................] - ETA: 6:48 - loss: 0.0646 - accuracy: 0.9769

KeyboardInterrupt: 

In [26]:
# ────────────────────────────────────────────────────────────────────────────────
# 9) Evaluate on Test Set
# ────────────────────────────────────────────────────────────────────────────────
# Score the in-memory `model` on the test pipeline; evaluate() returns the loss
# followed by the compiled "accuracy" metric.
# NOTE(review): the recorded training run ended with a KeyboardInterrupt, so the
# weights evaluated here are presumably those at the moment of interruption, not
# necessarily the best epoch — confirm before quoting this number.
test_loss, test_acc = model.evaluate(test_ds)
print(f"Test accuracy: {test_acc:.4f}")

Test accuracy: 0.9084


In [27]:
# ────────────────────────────────────────────────────────────────────────────────
# 10) Demo Predictions
# ────────────────────────────────────────────────────────────────────────────────
# Vectorize and score all demo samples in ONE batch. Calling model.predict()
# once per sample inside a Python loop is slow and floods the output with
# progress bars; a single batched call gives the same probabilities.
demo_texts = test_texts[:5]
demo_labels = test_labels[:5]
demo_token_ids = vectorizer(tf.constant(list(demo_texts), dtype=tf.string))
demo_probs = model.predict(demo_token_ids, verbose=0)  # shape (n_samples, NUM_CLASSES)

for sample_text, true_label, probs in zip(demo_texts, demo_labels, demo_probs):
    # Index of the highest-probability class for this sample.
    pred = int(np.argmax(probs))
    print("Text:", sample_text[:100], "…")
    print(f"True: {CLASS_NAMES[true_label]} | Pred: {CLASS_NAMES[pred]} (conf {probs[pred]:.2%})\n")


Text: Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disap …
True: Business | Pred: Business (conf 76.20%)

Text: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - T …
True: Sci/Tech | Pred: Sci/Tech (conf 100.00%)

Text: Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at th …
True: Sci/Tech | Pred: Sci/Tech (conf 99.85%)

Text: Prediction Unit Helps Forecast Wildfires (AP) AP - It's barely dawn when Mike Fitzpatrick starts his …
True: Sci/Tech | Pred: World (conf 50.27%)

Text: Calif. Aims to Limit Farm-Related Smog (AP) AP - Southern California's smog-fighting agency went aft …
True: Sci/Tech | Pred: Sci/Tech (conf 99.93%)



In [28]:
# ────────────────────────────────────────────────────────────────────────────────
# 11) Save the TextVectorization vocabulary
# ────────────────────────────────────────────────────────────────────────────────
# One token per line, in vocabulary order.
# NOTE(review): get_vocabulary() is called with its defaults, so the recorded
# 20000-token list presumably starts with the padding ('') and '[UNK]' entries;
# confirm that whatever loads this file expects them to be present.
vocab = vectorizer.get_vocabulary()
with open("D:/AIML/data/agnews_vocab.txt", "w", encoding="utf8") as vocab_file:
    vocab_file.writelines(token + "\n" for token in vocab)
print(f"✅ Vocabulary ({len(vocab)} tokens) saved to agnews_vocab.txt")

✅ Vocabulary (20000 tokens) saved to agnews_vocab.txt
