In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping

# Load Dataset
df = pd.read_csv('C:\\Users\\hp\\Desktop\\IITK AI & ML\\AI\\Project3Spamfilter\\train.csv')
df = df[['question_text', 'target']]

# Tokenization
MAX_WORDS = 20000  
MAX_LEN = 50        

tokenizer = Tokenizer(num_words=MAX_WORDS, lower=True)
tokenizer.fit_on_texts(df['question_text'])
X = tokenizer.texts_to_sequences(df['question_text'])
X = pad_sequences(X, maxlen=MAX_LEN)

y = df['target'].values

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# SMOTE to balance classes
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Model Definition 
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=50, input_length=MAX_LEN, trainable=True),
    SpatialDropout1D(0.2),
    LSTM(32, dropout=0.3, recurrent_dropout=0.3),
    Dense(16, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train Model
EPOCHS = 3
BATCH_SIZE = 32  

history = model.fit(X_train_resampled, y_train_resampled,
                    epochs=EPOCHS, batch_size=BATCH_SIZE,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping], verbose=1)

# Evaluate Model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

# Save Model
model.save("spam_filter.keras")




Epoch 1/3
[1m61266/61266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2400s[0m 39ms/step - accuracy: 0.8981 - loss: 0.2526 - val_accuracy: 0.9458 - val_loss: 0.1439
Epoch 2/3
[1m61266/61266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2097s[0m 34ms/step - accuracy: 0.9514 - loss: 0.1356 - val_accuracy: 0.9492 - val_loss: 0.1425
Epoch 3/3
[1m61266/61266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1359s[0m 22ms/step - accuracy: 0.9543 - loss: 0.1286 - val_accuracy: 0.9478 - val_loss: 0.1359
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 6ms/step - accuracy: 0.9476 - loss: 0.1369




Test Accuracy: 0.95
