<a href="https://colab.research.google.com/github/IamPrachiSharma/Phishing-Email-Detection/blob/main/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np  # Add this line
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load your dataset
data = pd.read_csv('Phishing_Email.csv')

# Drop rows with NaN values in 'email_text' column
data = data.dropna(subset=['Email Text'])

# Split the dataset into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(data['Email Text'], data['Email Type'], test_size=0.2, random_state=42)

# Tokenize and pad the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data), maxlen=100)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data), maxlen=100)

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Train Word2Vec embeddings on the training data
word2vec_model = Word2Vec(sentences=train_data.apply(lambda x: x.split()), vector_size=100, window=5, min_count=1, workers=4)

# Create an embedding matrix
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

# Build a neural network model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=100, weights=[embedding_matrix], trainable=False))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, train_labels_encoded, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
predictions = (model.predict(X_test) > 0.5).astype(int).flatten()

# Evaluate the model
accuracy = accuracy_score(test_labels_encoded, predictions)
conf_matrix = confusion_matrix(test_labels_encoded, predictions)
class_report = classification_report(test_labels_encoded, predictions)

# Display the results
print(f'Accuracy: {accuracy:.2f}')
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.92

Confusion Matrix:
[[1354  164]
 [ 144 2065]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      1518
           1       0.93      0.93      0.93      2209

    accuracy                           0.92      3727
   macro avg       0.92      0.91      0.91      3727
weighted avg       0.92      0.92      0.92      3727

