In [59]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
data = pd.read_csv("/content/OnionOrNot.csv")


In [61]:
def clean_text(text):
    """Cleans the input text by performing a few basic operations."""

    # Step 1: Convert all text to lowercase
    text = text.lower()

    # Step 2: Remove any characters that are not alphabets (remove numbers, punctuation, etc.)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Step 3: Split the text into a list of words
    words = text.split()

    # Step 4: Remove common stopwords (like "and", "the", "is", etc.)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Step 5: Join the words back into a cleaned string
    cleaned_text = ' '.join(words)

    return cleaned_text


In [62]:
# Apply cleaning to the text column

data['cleaned_text'] = data['text'].apply(clean_text)

In [63]:
# Step 3: Tokenization and Padding
# Tokenizer converts text to a sequence of integers (word indices)
max_words = 10000  # Maximum number of words to consider in the vocabulary
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['cleaned_text'])

In [64]:
# Convert text to sequences and pad them to ensure uniform input size
sequences = tokenizer.texts_to_sequences(data['cleaned_text'])
max_length = max(len(seq) for seq in sequences)  # Define max sequence length
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')


In [65]:
# Step 4: Prepare labels and split data
labels = data['label']
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


# **LSTM**

In [66]:
#Step 5: Build the LSTM Model
lstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_length),  # Embedding layer
    LSTM(64, return_sequences=False),  # LSTM layer with 64 units
    tf.keras.layers.Dropout(0.3),  # Dropout for regularization # Changed to tf.keras.layers.Dropout
    Dense(64, activation='relu'),  # Fully connected dense layer
    tf.keras.layers.Dropout(0.3),  # Additional dropout # Changed to tf.keras.layers.Dropout
    Dense(1, activation='sigmoid')  # Output layer (sigmoid for binary classification)
])



In [67]:
# Compile the LSTM Model
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [68]:
# Add Early Stopping Callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)



In [69]:
# Train the LSTM Model
lstm_epochs = 50  # Set to 50 epochs
batch_size = 32
lstm_history = lstm_model.fit(X_train, y_train, validation_split=0.2, epochs=lstm_epochs, batch_size=batch_size, verbose=1, callbacks=[early_stopping])


Epoch 1/50
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 7ms/step - accuracy: 0.6671 - loss: 0.6242 - val_accuracy: 0.7724 - val_loss: 0.5379
Epoch 2/50
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.7694 - loss: 0.5290 - val_accuracy: 0.6203 - val_loss: 0.6698
Epoch 3/50
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.6309 - loss: 0.6612 - val_accuracy: 0.6203 - val_loss: 0.6658
Epoch 4/50
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.6307 - loss: 0.6603 - val_accuracy: 0.6203 - val_loss: 0.6666


In [70]:
# Evaluate the LSTM Model
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test, y_test, verbose=0)
print(f"LSTM Model Test Loss: {lstm_loss:.4f}")
print(f"LSTM Model Test Accuracy: {lstm_accuracy:.4f}")


LSTM Model Test Loss: 0.5255
LSTM Model Test Accuracy: 0.7808


# **BiLSTM**

In [71]:
from tensorflow.keras.layers import Bidirectional  # Import Bidirectional


# Step 5: Build the BiLSTM Model
bilstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_length),  # Embedding layer
    Bidirectional(LSTM(64, return_sequences=False)),  # Bidirectional LSTM layer with 64 units
    tf.keras.layers.Dropout(0.3),  # Dropout for regularization
    Dense(64, activation='relu'),  # Fully connected dense layer
    tf.keras.layers.Dropout(0.3),  # Additional dropout
    Dense(1, activation='sigmoid')  # Output layer (sigmoid for binary classification)
])

In [72]:
# Compile the BiLSTM Model
bilstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [73]:

# Step 7: Add Early Stopping Callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [74]:
# Train the BiLSTM Model
bilstm_epochs = 50  # Set to 50 epochs
bilstm_history = bilstm_model.fit(X_train, y_train, validation_split=0.2, epochs=bilstm_epochs, batch_size=batch_size, verbose=1, callbacks=[early_stopping])


Epoch 1/50
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.7112 - loss: 0.5407 - val_accuracy: 0.8409 - val_loss: 0.3561
Epoch 2/50
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.9224 - loss: 0.2052 - val_accuracy: 0.8427 - val_loss: 0.3901
Epoch 3/50
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.9593 - loss: 0.1097 - val_accuracy: 0.8336 - val_loss: 0.5712
Epoch 4/50
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.9822 - loss: 0.0543 - val_accuracy: 0.8240 - val_loss: 0.7204


In [75]:
# Evaluate the BiLSTM Model
bilstm_loss, bilstm_accuracy = bilstm_model.evaluate(X_test, y_test, verbose=0)
print(f"BiLSTM Model Test Loss: {bilstm_loss:.4f}")
print(f"BiLSTM Model Test Accuracy: {bilstm_accuracy:.4f}")


BiLSTM Model Test Loss: 0.3429
BiLSTM Model Test Accuracy: 0.8458


In [76]:
# Step 7: Compare Models
print("\nComparison of Models:")
print(f"LSTM Model - Loss: {lstm_loss:.4f}, Accuracy: {lstm_accuracy:.4f}")
print(f"BiLSTM Model - Loss: {bilstm_loss:.4f}, Accuracy: {bilstm_accuracy:.4f}")



Comparison of Models:
LSTM Model - Loss: 0.5255, Accuracy: 0.7808
BiLSTM Model - Loss: 0.3429, Accuracy: 0.8458


In [77]:
# Generate Classification Reports
print("\nLSTM Model Classification Report:")
lstm_predictions = (lstm_model.predict(X_test) > 0.5).astype(int)
print(classification_report(y_test, lstm_predictions))



LSTM Model Classification Report:
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
              precision    recall  f1-score   support

           0       0.80      0.87      0.83      3018
           1       0.74      0.63      0.68      1782

    accuracy                           0.78      4800
   macro avg       0.77      0.75      0.76      4800
weighted avg       0.78      0.78      0.78      4800



In [78]:
print("\nBiLSTM Model Classification Report:")
bilstm_predictions = (bilstm_model.predict(X_test) > 0.5).astype(int)
print(classification_report(y_test, bilstm_predictions))



BiLSTM Model Classification Report:
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      3018
           1       0.82      0.75      0.78      1782

    accuracy                           0.85      4800
   macro avg       0.84      0.83      0.83      4800
weighted avg       0.84      0.85      0.84      4800

