In [13]:
# ================================================
# BiLSTM + fastText Bangla (cc.bn.300.vec) for Genre from Summary
# ================================================
import os, re, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# -----------------------
# Config
# -----------------------
TRAIN_CSV = "Dataset_60_20_20/train.csv"
VAL_CSV   = "Dataset_60_20_20/validation.csv"
TEST_CSV  = "Dataset_60_20_20/test.csv"

# fastText Bangla vectors (TEXT .vec file)
EMBED_FILE = "cc.bn.300.vec"  # <-- put the path to your file here
EMBED_DIM  = 300              # must match the file

MAX_WORDS  = 100_000          # cap vocab size for tokenizer
MAX_LEN    = 300              # sequence length (pad/truncate)
BATCH_SIZE = 64
EPOCHS     = 15
SEED       = 42

np.random.seed(SEED)
tf.random.set_seed(SEED)


In [14]:

# -----------------------
# Load data
# -----------------------
train_df = pd.read_csv(TRAIN_CSV)
val_df   = pd.read_csv(VAL_CSV)
test_df  = pd.read_csv(TEST_CSV)

for d in (train_df, val_df, test_df):
    d["Summary"] = d["Summary"].astype(str)
    d["Genre"]   = d["Genre"].astype(str)

X_train, y_train = train_df["Summary"], train_df["Genre"]
X_val,   y_val   = val_df["Summary"],   val_df["Genre"]
X_test,  y_test  = test_df["Summary"],  test_df["Genre"]


In [15]:

# -----------------------
# Labels
# -----------------------
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc   = le.transform(y_val)
y_test_enc  = le.transform(y_test)
num_classes = len(le.classes_)
print("Classes:", list(le.classes_))


Classes: ['Adventure', 'Biography and Autobiography', 'Classic Novel', 'Classic Story', 'Contemporary Novel', 'Contemporary Story', 'Cooking, Food and Nutrition', 'History and Tradition', 'Math', 'Mystery', 'Philosophy', 'Politics', 'Religious', 'Sciene Fiction', 'Shishu Kishor', 'Thriller']


In [16]:

# -----------------------
# Tokenization (Bangla-friendly)
# -----------------------
# Keep case / avoid stripping Bangla punctuation inconsistently
tokenizer = Tokenizer(num_words=MAX_WORDS, lower=False, filters='')
tokenizer.fit_on_texts(X_train.tolist() + X_val.tolist())

def to_seq(texts):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, maxlen=MAX_LEN, padding="post", truncating="post")

Xtr_seq  = to_seq(X_train)
Xval_seq = to_seq(X_val)
Xte_seq  = to_seq(X_test)

word_index = tokenizer.word_index
vocab_size = min(MAX_WORDS, len(word_index) + 1)
print("Vocab size (capped):", vocab_size)


Vocab size (capped): 100000


In [17]:

# -----------------------
# Load fastText .vec embeddings
# -----------------------
def load_embeddings_txt(path, embed_dim):
    """
    Load text embeddings (word + EMBED_DIM floats per line).
    Works for fastText .vec and GloVe-like text files.
    Skips header lines that don't have enough fields.
    """
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            # fastText .vec often has a header line "<vocab> <dim>"
            if len(parts) < embed_dim + 1:
                continue
            word = parts[0]
            try:
                vec = np.asarray(parts[1:1+embed_dim], dtype='float32')
            except ValueError:
                continue
            embeddings_index[word] = vec
    return embeddings_index

print("Loading embeddings from:", EMBED_FILE)
embeddings_index = load_embeddings_txt(EMBED_FILE, EMBED_DIM)
print("Embeddings loaded:", len(embeddings_index))

# Build embedding matrix
embedding_matrix = np.random.normal(0.0, 0.05, size=(vocab_size, EMBED_DIM)).astype('float32')
hit = 0
for w, i in word_index.items():
    if i >= vocab_size:
        continue
    vec = embeddings_index.get(w)
    if vec is not None and len(vec) == EMBED_DIM:
        embedding_matrix[i] = vec
        hit += 1
print(f"Initialized from pre-trained: {hit}/{vocab_size} = {hit/vocab_size:.2%}")


Loading embeddings from: cc.bn.300.vec
Embeddings loaded: 1468578
Initialized from pre-trained: 61834/100000 = 61.83%


In [18]:

# -----------------------
# (Optional) Use fastText .bin for OOV coverage via subwords
# -----------------------
# from gensim.models.fasttext import load_facebook_vectors  # pip install gensim
# FT_BIN_FILE = "cc.bn.300.bin"
# def load_ft_bin_to_matrix(bin_path, word_index, vocab_size, embed_dim):
#     ft = load_facebook_vectors(bin_path)
#     emb = np.random.normal(0.0, 0.05, size=(vocab_size, embed_dim)).astype('float32')
#     hit = 0
#     for w, i in word_index.items():
#         if i >= vocab_size: 
#             continue
#         try:
#             emb[i] = ft.get_vector(w)  # subword composition for OOV too
#             hit += 1
#         except KeyError:
#             pass
#     print(f"Initialized from fastText-bin: {hit}/{vocab_size} = {hit/vocab_size:.2%}")
#     return emb
# # To use .bin instead of .vec, uncomment:
# # embedding_matrix = load_ft_bin_to_matrix(FT_BIN_FILE, word_index, vocab_size, EMBED_DIM)


In [19]:

# -----------------------
# Build model
# -----------------------
def build_model(vocab_size, embed_dim, max_len, embedding_matrix, num_classes):
    model = Sequential()
    model.add(Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False  # set True to fine-tune embeddings
    ))
    model.add(Bidirectional(LSTM(128, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=['accuracy']
    )
    return model

model = build_model(vocab_size, EMBED_DIM, MAX_LEN, embedding_matrix, num_classes)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 300)          30000000  
                                                                 
 bidirectional (Bidirection  (None, 256)               439296    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 batch_normalization (Batch  (None, 256)               1024      
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 256)               65792     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0

In [None]:

# -----------------------
# Train
# -----------------------
ckpt_path = "best_lstm_fasttext_bn.h5"
callbacks = [
    EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-5, verbose=1),
    ModelCheckpoint(ckpt_path, monitor="val_accuracy", save_best_only=True, verbose=1)
]

history = model.fit(
    Xtr_seq, y_train_enc,
    validation_data=(Xval_seq, y_val_enc),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)


Epoch 1/15
Epoch 1: val_accuracy improved from -inf to 0.41148, saving model to best_lstm_fasttext_bn.h5


  saving_api.save_model(


Epoch 2/15
Epoch 2: val_accuracy improved from 0.41148 to 0.50067, saving model to best_lstm_fasttext_bn.h5
Epoch 3/15
Epoch 3: val_accuracy improved from 0.50067 to 0.51570, saving model to best_lstm_fasttext_bn.h5
Epoch 4/15

In [None]:

# -----------------------
# Evaluate on TEST
# -----------------------
y_prob = model.predict(Xte_seq, batch_size=BATCH_SIZE)
y_pred = y_prob.argmax(axis=1)

acc = accuracy_score(y_test_enc, y_pred)
print("\nTEST Accuracy:", f"{acc:.4f}")
print("\nClassification Report (TEST):")
print(classification_report(y_test_enc, y_pred, target_names=le.classes_))


In [None]:

# Confusion matrix
cm = confusion_matrix(y_test_enc, y_pred)
fig, ax = plt.subplots(figsize=(10,8))
im = ax.imshow(cm, cmap="Blues")
ax.set_title("Confusion Matrix - BiLSTM (fastText Bangla)", fontsize=14, fontweight="bold")
ax.set_xlabel("Predicted"); ax.set_ylabel("True")
ax.set_xticks(np.arange(num_classes)); ax.set_yticks(np.arange(num_classes))
ax.set_xticklabels(le.classes_, rotation=45, ha='right'); ax.set_yticklabels(le.classes_)
for i in range(num_classes):
    for j in range(num_classes):
        ax.text(j, i, cm[i, j], ha='center', va='center', fontsize=8)
fig.colorbar(im, fraction=0.046, pad=0.04)
plt.tight_layout(); plt.show()


In [None]:

# Training curves (optional)
plt.figure(figsize=(10,4))
plt.plot(history.history['accuracy'], label='train_acc')
plt.plot(history.history['val_accuracy'], label='val_acc')
plt.title('Accuracy'); plt.legend(); plt.show()

plt.figure(figsize=(10,4))
plt.plot(history.history['loss'], label='train_loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.title('Loss'); plt.legend(); plt.show()
