In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Load Dataset
df = pd.read_csv("03_Cleaned_Initial_Dataset.csv")

# Assign the text and label columns
text_col = "cleaned_text"
label_col = "label_encoded"

# Replace missing texts with "" BEFORE casting to str.
# Casting first would turn NaN into the literal string "nan", which fillna
# can no longer detect and which the tokenizer would then treat as a real word.
df[text_col] = df[text_col].fillna("").astype(str)

# Target labels and raw text features used by the splits below
y = df[label_col]
X = df[text_col]

# Hold out 20% of the data as the test set, stratified on the label
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Carve the validation set out of the remaining 80%:
# 0.125 of that remainder equals 10% of the full dataset (final split 70/10/20)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.125,
    stratify=y_train_full,
    random_state=42,
)

print(f"Train size: {len(X_train)}, Val size: {len(X_val)}, Test size: {len(X_test)}")

# Tokenize and Pad Sequences
max_words = 10000  # vocabulary cap passed to the tokenizer as num_words
max_len = 100      # every sequence is padded/truncated to this many tokens

# Punctuation stripped by the tokenizer; '<' and '>' are deliberately left out
# so that placeholder tokens such as <phone>, <email> and <url> are kept
custom_filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>", filters=custom_filters)

# The vocabulary is learned from the training texts only
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences, one per split
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

# Left-pad short sequences and cut long ones at the end so all rows have max_len ids
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(seq, maxlen=max_len, padding='pre', truncating='post')
    for seq in (X_train_seq, X_val_seq, X_test_seq)
)

Train size: 4024, Val size: 575, Test size: 1150


In [4]:
# Compute 'balanced' class weights from the training labels so the minority
# class contributes proportionally more to the loss.
classes = np.unique(y_train)
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)

# Key each weight by its actual class label rather than by position:
# dict(enumerate(...)) is only correct when the labels are exactly 0..k-1.
# Plain Python int/float values also keep the printed dict readable.
weights_dict = {int(label): float(weight) for label, weight in zip(classes, weights)}
print(f"Class Weights: {weights_dict}")

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Stop when val_loss has not improved for 2 consecutive epochs and roll the
# model back to the weights of the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Build Model
# `input_length` is intentionally not passed to Embedding: it is deprecated and
# ignored in Keras 3, and the sequence length is inferred from the padded input.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # single sigmoid unit -> probability of class 1
])

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

Class Weights: {0: np.float64(0.5954424385912992), 1: np.float64(3.1193798449612404)}




In [5]:
print("\nStarting Training...")

# Fit on the training split; the separate validation split drives early stopping,
# and the class weights compensate for the label imbalance.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=10,
    class_weight=weights_dict,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stop],
)

print("\nTraining Complete.")


Starting Training...
Epoch 1/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 88ms/step - accuracy: 0.6798 - loss: 0.4667 - val_accuracy: 0.9878 - val_loss: 0.0417
Epoch 2/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 80ms/step - accuracy: 0.9881 - loss: 0.0532 - val_accuracy: 0.9774 - val_loss: 0.0735
Epoch 3/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 79ms/step - accuracy: 0.9955 - loss: 0.0176 - val_accuracy: 0.9913 - val_loss: 0.0419

Training Complete.


In [6]:
# Evaluate on Validation Set
val_loss, val_accuracy = model.evaluate(X_val_pad, y_val, verbose=0)
print(f"Validation Set -> Loss: {val_loss:.4f} | Accuracy: {val_accuracy:.4f}")

# Evaluate on Test Set
test_loss, test_accuracy = model.evaluate(X_test_pad, y_test, verbose=0)
print(f"Test Set       -> Loss: {test_loss:.4f} | Accuracy: {test_accuracy:.4f}")

# Predict probabilities (verbose=0 matches the evaluate calls above and keeps
# the progress bar out of the saved output)
y_pred_prob = model.predict(X_test_pad, verbose=0)

# Convert probabilities to binary labels (0 or 1) and flatten the (n, 1)
# model output to 1-D so it has the same shape as y_test
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Print confusion matrix (rows = true class, columns = predicted class)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print classification report (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Validation Set -> Loss: 0.0417 | Accuracy: 0.9878
Test Set       -> Loss: 0.0548 | Accuracy: 0.9817
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step

Confusion Matrix:
[[964   2]
 [ 19 165]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.90      0.94       184

    accuracy                           0.98      1150
   macro avg       0.98      0.95      0.96      1150
weighted avg       0.98      0.98      0.98      1150

