In [24]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model, callbacks
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# ────────────────────────────────────────────────────────────────────────────────
# 0) Hyperparameters & Constants
# ────────────────────────────────────────────────────────────────────────────────
# Reproducibility: seed Python, NumPy and TensorFlow in one call so weight
# initialisation and dropout masks repeat across runs. (Some GPU kernels may
# still be nondeterministic.) 42 is the value the split and shuffle also use.
SEED              = 42
tf.keras.utils.set_random_seed(SEED)

# Text pipeline
MAX_VOCAB_SIZE    = 20000   # vocabulary cap handed to TextVectorization
MAX_SEQUENCE_LEN  = 200     # token sequences are padded/truncated to this length

# Model
EMBEDDING_DIM     = 300     # width of each embedding row (the GloVe file is the 300d one)
LSTM_UNITS        = 64      # units per LSTM direction

# Training
BATCH_SIZE        = 64
EPOCHS            = 1
AUTOTUNE          = tf.data.AUTOTUNE

# Labels
NUM_CLASSES       = 4
CLASS_NAMES       = ["World", "Sports", "Business", "Sci/Tech"]


In [26]:
# ────────────────────────────────────────────────────────────────────────────────
# 1) Load & preprocess AG News CSVs
# ────────────────────────────────────────────────────────────────────────────────
def _read_ag_news(csv_path):
    """Read one headerless AG News CSV and add the model-ready columns.

    The ``label`` column is shifted down by one (presumably the raw ids are
    1-based -- verify against the data), and ``title`` and ``description``
    are joined with a single space into a new ``text`` column.
    """
    frame = pd.read_csv(csv_path, header=None,
                        names=["label","title","description"])
    frame["label"] = frame["label"] - 1
    frame["text"] = frame["title"].str.cat(frame["description"], sep=" ")
    return frame


train_df = _read_ag_news("D:/AIML/data/ag_news_train.csv")
test_df  = _read_ag_news("D:/AIML/data/ag_news_test.csv")


In [27]:
# ────────────────────────────────────────────────────────────────────────────────
# 2) Train/validation split
# ────────────────────────────────────────────────────────────────────────────────
# Hold out 20% of the training rows for validation; stratifying on the label
# keeps the class proportions the same in both parts.
_all_train_labels = train_df["label"].values
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(
    train_df["text"].values,
    _all_train_labels,
    test_size=0.2,
    random_state=42,
    stratify=_all_train_labels,
)

# The test CSV is used as-is, without any further splitting.
test_texts, test_labels = test_df["text"].values, test_df["label"].values


In [28]:
# ────────────────────────────────────────────────────────────────────────────────
# 3) TextVectorization
# ────────────────────────────────────────────────────────────────────────────────
# Turns raw strings into integer token-id sequences of a fixed length.
_vectorizer_options = dict(
    max_tokens=MAX_VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_SEQUENCE_LEN,
)
vectorizer = layers.TextVectorization(**_vectorizer_options)

# The vocabulary is learned from the training texts only.
vectorizer.adapt(train_texts)

def vectorize_text(text, label):
    """Turn one raw string into its token-id sequence; the label passes through.

    A trailing axis is added to ``text`` before it goes through the module-level
    ``vectorizer``; axis 0 of the result is then squeezed away, so the
    vectorizer output must have a leading dimension of exactly 1 (i.e. ``text``
    is expected to be a single string, not a batch).

    Returns:
        A ``(token_ids, label)`` tuple.
    """
    as_column = tf.expand_dims(text, -1)
    return tf.squeeze(vectorizer(as_column), axis=0), label

def make_dataset(texts, labels, shuffle=False):
    """Build a batched, prefetched ``tf.data`` pipeline of (token_ids, label).

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of labels aligned with ``texts``.
        shuffle: When true, shuffle the examples before batching, using a
            buffer as large as the input and a fixed seed of 42.

    Returns:
        A ``tf.data.Dataset`` yielding batches of up to ``BATCH_SIZE``
        examples, each a ``(token_ids, labels)`` pair.
    """
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        ds = ds.shuffle(len(texts), seed=42)

    # Batch BEFORE vectorizing: the vectorizer then runs once per batch instead
    # of once per example, which removes most of the per-element map overhead
    # while producing the same batches.
    ds = ds.batch(BATCH_SIZE)

    def _vectorize_batch(text_batch, label_batch):
        # (batch,) strings -> (batch, 1) -> (batch, MAX_SEQUENCE_LEN) token ids
        return vectorizer(tf.expand_dims(text_batch, -1)), label_batch

    ds = ds.map(_vectorize_batch, num_parallel_calls=AUTOTUNE)
    return ds.prefetch(AUTOTUNE)

# Only the training split is shuffled; validation and test keep their existing order.
train_ds = make_dataset(texts=train_texts, labels=train_labels, shuffle=True)
val_ds   = make_dataset(texts=val_texts,   labels=val_labels,   shuffle=False)
test_ds  = make_dataset(texts=test_texts,  labels=test_labels,  shuffle=False)

In [29]:
# Load the pretrained GloVe vectors into a {word: float32 vector} dict.
embeddings_index = {}
skipped_lines = 0

# `with` releases the file handle even if parsing fails part-way through.
with open('D:/AIML/data/glove.42B.300d.txt', 'r', encoding='utf-8') as glovefile:
    for line in tqdm(glovefile):
        # Strip the line ending, then split from the right: the last
        # EMBEDDING_DIM fields are the coefficients, and whatever remains on
        # the left is the token (kept intact even if it contains spaces).
        fields = line.rstrip().rsplit(" ", EMBEDDING_DIM)
        if len(fields) != EMBEDDING_DIM + 1:
            # Wrong number of coefficients: such a row could not be copied
            # into an EMBEDDING_DIM-wide matrix later, so leave it out.
            skipped_lines += 1
            continue
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))
if skipped_lines:
    print('Skipped %s malformed lines.' % skipped_lines)

1917494it [02:15, 14177.39it/s]


Found 1917494 word vectors.


In [30]:
""" 
embeddings_index["hello"] = array([0.1,0.2,0.3], dtype=float32)
{
  "hello": array([ 0.1,  0.2,  0.3], dtype=float32),
  "world": array([ 0.4,  0.5,  0.6], dtype=float32),
  "test":  array([-0.1, 0.0,  0.1], dtype=float32)
}

"""

' \nembeddings_index["hello"] = array([0.1,0.2,0.3], dtype=float32)\n{\n  "hello": array([ 0.1,  0.2,  0.3], dtype=float32),\n  "world": array([ 0.4,  0.5,  0.6], dtype=float32),\n  "test":  array([-0.1, 0.0,  0.1], dtype=float32)\n}\n\n'

In [35]:
# 1) Build the embedding matrix from the GloVe dict and the vectorizer vocabulary.
# Row i of the matrix belongs to vocabulary entry i; the slice guarantees that
# no index can fall outside the MAX_VOCAB_SIZE rows allocated below.
vocab = vectorizer.get_vocabulary()[:MAX_VOCAB_SIZE]
embedding_matrix = np.zeros((MAX_VOCAB_SIZE, EMBEDDING_DIM), dtype="float32")

for row, token in enumerate(vocab):
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # No pretrained vector for this token: its row stays all zeros.
        continue
    embedding_matrix[row] = glove_vector

In [37]:
# 2) Token-id input followed by a frozen, GloVe-initialised embedding
text_inputs = layers.Input(shape=(MAX_SEQUENCE_LEN,), name="input_tokens", dtype="int32")
# The sequence length is already fixed by the Input above, so the deprecated
# `input_length` argument is omitted. mask_zero=True makes downstream layers
# skip positions whose token id is 0.
embedding_layer = layers.Embedding(input_dim=MAX_VOCAB_SIZE,
                                   output_dim=EMBEDDING_DIM,
                                   trainable=False,
                                   mask_zero=True)
embedded_sequence = embedding_layer(text_inputs)
# Calling the layer above created its weight matrix; overwrite it with the
# pretrained vectors. set_weights is used instead of the legacy `weights=`
# constructor keyword, which newer Keras releases do not accept.
embedding_layer.set_weights([embedding_matrix])

# 3) Build the rest of the BiLSTM model
# First BiLSTM emits the full sequence so the second one can consume it.
lstm1 = layers.Bidirectional(layers.LSTM(LSTM_UNITS, return_sequences=True))(embedded_sequence)
# Second BiLSTM collapses the sequence to a single vector per example.
lstm2 = layers.Bidirectional(layers.LSTM(LSTM_UNITS))(lstm1)
x     = layers.Dropout(0.5)(lstm2)
x     = layers.Dense(64, activation="relu")(x)
x     = layers.Dropout(0.5)(x)
# One softmax probability per class.
out   = layers.Dense(NUM_CLASSES, activation="softmax")(x)

model = Model(inputs=text_inputs, outputs=out, name="bilstm_glove")


In [38]:

# 4) Compile
# The sparse cross-entropy loss takes integer class ids as targets, so the
# labels do not need one-hot encoding.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.summary()

Model: "bilstm_glove"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_tokens (InputLayer)   [(None, 200)]             0         
                                                                 
 embedding_5 (Embedding)     (None, 200, 300)          6000000   
                                                                 
 bidirectional (Bidirection  (None, 200, 128)          186880    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)               

In [39]:
# ────────────────────────────────────────────────────────────────────────────────
# 5) Train
# ────────────────────────────────────────────────────────────────────────────────
# Write the model to disk only on epochs where validation accuracy improves.
ckpt = callbacks.ModelCheckpoint(
    "D:/AIML/data/bilstm_tc_fun_glove.h5",
    monitor="val_accuracy",
    save_best_only=True
)
# Stop after 2 epochs without a better validation loss and roll back to the
# best weights seen.
# NOTE(review): the checkpoint tracks val_accuracy while early stopping tracks
# val_loss, so the saved file and the in-memory weights may come from
# different epochs -- confirm this is intended.
es = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)
# Keep the returned History so the per-epoch metrics stay available for
# inspection; assigning it also stops its repr from being printed as output.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[ckpt, es]
)



  saving_api.save_model(




<keras.src.callbacks.History at 0x1af5e8f1180>

In [40]:

# ────────────────────────────────────────────────────────────────────────────────
# 6) Evaluate
# ────────────────────────────────────────────────────────────────────────────────
# evaluate() returns the loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(test_ds)
loss, acc = test_metrics
print(f"Test accuracy: {acc:.4f}")

Test accuracy: 0.9061
