In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input, Concatenate
from sklearn.metrics import classification_report

# Load the merged CSV file; contains preprocessed text and additional features
df = pd.read_csv('merged_data.csv')

# Extract preprocessed text, additional features, and labels
X_text = df['lemmatized_tokens']
X_features = df[['html_tag_count', 'css_tag_count', 'spam_phrase_count', 'link_count', 'grammar_error_count', 'misspelled_word_count']]
y = df['Class_Label']

# Convert labels to numerical values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Define maximum sequence length
max_len = 100  # Adjust as needed

# Tokenize and pad the sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_text)
X_text = tokenizer.texts_to_sequences(X_text)
X_text = pad_sequences(X_text, maxlen=max_len)

# Get the size of the vocabulary
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token

# Split the data into training and testing sets
X_text_train, X_text_test, X_features_train, X_features_test, y_train, y_test = train_test_split(X_text, X_features, y, test_size=0.2, random_state=42)

# Define the LSTM model architecture
embedding_dim = 100  # Adjust as needed
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

# Define input layer for additional features
feature_input = Input(shape=(X_features.shape[1],))

# Concatenate LSTM output with additional features
concatenated = Concatenate()([model.output, feature_input])

# Add dense layers for further processing
concatenated = Dense(64, activation='relu')(concatenated)
output = Dense(1, activation='sigmoid')(concatenated)

# Define the model
model = Model(inputs=[model.input, feature_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit([X_text_train, X_features_train], y_train, epochs=10, batch_size=32, validation_data=([X_text_test, X_features_test], y_test))

# Evaluate the model
loss, accuracy = model.evaluate([X_text_test, X_features_test], y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Generate classification report
y_pred = model.predict([X_text_test, X_features_test])
y_pred_classes = (y_pred > 0.5).astype(int)
print("Classification Report:")
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.11297600716352463, Test Accuracy: 0.9725000262260437
Classification Report:
              precision    recall  f1-score   support

  Fraudulent       0.96      0.98      0.97       796
      Normal       0.98      0.96      0.97       804

    accuracy                           0.97      1600
   macro avg       0.97      0.97      0.97      1600
weighted avg       0.97      0.97      0.97      1600

