In [74]:
#required libraries.
import pandas as pd #data manipulation and analysis (DataFrame)
import matplotlib.pyplot as plt  #creating static visualizations
import seaborn as sns  # improved static visualization
import numpy as np #numeric operation
import plotly.express as px #interactive visualizations, like scatter plots, bar charts.


# sklearn's TfidfVectorizer converts text into TF-IDF features for text analysis.
# CountVectorizer converts text into a matrix of token counts.

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier  #(Multi-Layer Perceptron) is a neural network-based model used for classification.
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding,GRU,LSTM,Bidirectional,SimpleRNN
from tensorflow.keras.utils import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense,Dropout
import tensorflow as tf
import warnings

warnings.filterwarnings('ignore')

In [75]:
# Load the phishing-email dataset from the working directory and preview it.
DATA_FILE = "Phishing_Email.csv"
df = pd.read_csv(DATA_FILE)

df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [76]:
df.isnull().sum()

Unnamed: 0     0
Email Text    16
Email Type     0
dtype: int64

In [77]:
print(df.columns)



Index(['Unnamed: 0', 'Email Text', 'Email Type'], dtype='object')


In [78]:
print(df.columns.tolist())




['Unnamed: 0', 'Email Text', 'Email Type']


In [79]:
# Remove rows where "Email Text" is NaN (16 rows, per the null count shown earlier); modifies df in place.
df.dropna(subset=["Email Text"], inplace=True)


In [80]:
#Check missing values
df.isnull().sum()



Unnamed: 0    0
Email Text    0
Email Type    0
dtype: int64

In [81]:
# Rename the "Unnamed: 0" column to "Email_count" (modifies df in place).
df.rename(columns={"Unnamed: 0": "Email_count"}, inplace=True)


In [82]:
df.head()

Unnamed: 0,Email_count,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [83]:
# Shape of the dataset as a (rows, columns) tuple
n_rows_cols = df.shape
print("Dimension of the row data:", n_rows_cols)

Dimension of the row data: (18634, 3)


In [47]:
# Count the emails in each class once and turn the result into a tidy frame.
# rename_axis + reset_index(name=...) pins the column names explicitly, so
# they do not depend on the pandas version in use.
email_type_counts = (
    df['Email Type']
    .value_counts()
    .rename_axis('Email Type')
    .reset_index(name='Count')
)

# Create the bar chart.
# Colour by the class column instead of passing the literal list
# ['blue', 'red'] as `color`: Plotly Express treats such a list as category
# labels (producing a legend that reads "blue"/"red"), not as colours, and a
# two-element list only fits exactly two classes. The actual colours are
# supplied through color_discrete_sequence, in value_counts order.
bar = px.bar(email_type_counts, x='Email Type', y='Count', color='Email Type',
             color_discrete_sequence=['blue', 'red'],
             labels={'Email Type': 'Category Safe Email and Phishing Email', 'Count': 'Count of Emails'},
             title="Categorical Distribution")

# Show the plot
bar.show()

In [84]:
# Tally each email class once and reuse the result for the pie chart
type_counts = df['Email Type'].value_counts()
fig_pie = px.pie(
    type_counts,
    names=type_counts.index,
    values=type_counts.values,
    title="Category Safe Email and Phishing Email",
)

# Render the pie chart
fig_pie.show()

In [None]:
#Preprocessing data

In [86]:

from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')

# Function to clean HTML
def clean_html(text):
    """Return the text content of ``text`` after parsing it as HTML.

    The input is parsed with BeautifulSoup using the "html.parser" backend
    and the result of ``get_text()`` on the parsed document is returned.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Function to convert text to lowercase
def to_lowercase(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

# Function to remove special characters
def remove_special_characters(text):
    """Delete every character of ``text`` that is not an ASCII letter or whitespace.

    Matching characters are removed outright (not replaced by a space), so
    digits and punctuation simply disappear from the result.
    """
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
    return non_letter_pattern.sub('', text)

# Function to remove stopwords
# Cache of stop-word sets keyed by language, filled on first use so the NLTK
# corpus is read once instead of once per email.
_STOP_WORD_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split on whitespace, every word found in NLTK's English
    stop-word list is dropped, and the remaining words are joined with single
    spaces. The comparison is exact (case-sensitive), so the text should be
    lowercased beforehand.

    Parameters:
        text: string to filter.

    Returns:
        The filtered string; empty if no words remain.
    """
    stop_words = _get_stop_words()

    return ' '.join(word for word in text.split() if word not in stop_words)

# Pipeline function to preprocess text
def preprocess_text(text):
    """Run the full cleaning pipeline on one email body and return the result.

    The steps are applied in this order: HTML cleaning, lowercasing,
    special-character removal, stop-word removal.
    """
    pipeline = (
        clean_html,                 # Step 1: Clean HTML
        to_lowercase,               # Step 2: Lowercase
        remove_special_characters,  # Step 3: Remove special characters
        remove_stopwords,           # Step 4: Remove stopwords
    )
    for step in pipeline:
        text = step(text)

    return text

# Run the preprocessing pipeline over every email body and store the result
# in a new "Cleaned Text" column
cleaned_text = df['Email Text'].apply(preprocess_text)
df['Cleaned Text'] = cleaned_text





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hiros\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
df.head()

Unnamed: 0,Email_count,Email Text,Email Type,Cleaned Text
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,disc uniformitarianism sex lang dick hudson ob...
1,1,the other side of * galicismos * * galicismo *...,Safe Email,side galicismos galicismo spanish term names i...
2,2,re : equistar deal tickets are you still avail...,Safe Email,equistar deal tickets still available assist r...
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,hello hot lil horny toy one dream open minded ...
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email,software incredibly low prices lower drapery s...


In [None]:
#Tokenization
#Convert the cleaned text into tokens for further processing

In [88]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def tokenize_text(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the tokens."""
    tokens = word_tokenize(text)
    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hiros\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [95]:

# Tokenize every cleaned email and keep the token lists in "Tokenize Text"
tokenized_text = df['Cleaned Text'].apply(tokenize_text)
df['Tokenize Text'] = tokenized_text

In [96]:
df.head()

Unnamed: 0,Email_count,Email Text,Email Type,Cleaned Text,Tokenize Text,Stemmed Text
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,disc uniformitarianism sex lang dick hudson ob...,"[disc, uniformitarianism, sex, lang, dick, hud...","[re, :, 6, ., 1100, ,, disc, :, uniformitarian..."
1,1,the other side of * galicismos * * galicismo *...,Safe Email,side galicismos galicismo spanish term names i...,"[side, galicismos, galicismo, spanish, term, n...","[the, other, side, of, *, galicismo, *, *, gal..."
2,2,re : equistar deal tickets are you still avail...,Safe Email,equistar deal tickets still available assist r...,"[equistar, deal, tickets, still, available, as...","[re, :, equistar, deal, ticket, are, you, stil..."
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,hello hot lil horny toy one dream open minded ...,"[hello, hot, lil, horny, toy, one, dream, open...","[hello, i, am, your, hot, lil, horni, toy, ., ..."
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email,software incredibly low prices lower drapery s...,"[software, incredibly, low, prices, lower, dra...","[softwar, at, incred, low, price, (, 86, %, lo..."


In [97]:
#Stemming (lemmatization follows in a separate cell below)
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_words(tokens):
    """Return a list holding ``stemmer.stem(token)`` for every token in ``tokens``."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems


In [98]:
# Stem every token list and store the result in "Stemmed Text"
stemmed_text = df['Tokenize Text'].apply(stem_words)
df['Stemmed Text'] = stemmed_text

In [100]:
df.head()

Unnamed: 0,Email_count,Email Text,Email Type,Cleaned Text,Tokenize Text,Stemmed Text
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,disc uniformitarianism sex lang dick hudson ob...,"[disc, uniformitarianism, sex, lang, dick, hud...","[disc, uniformitarian, sex, lang, dick, hudson..."
1,1,the other side of * galicismos * * galicismo *...,Safe Email,side galicismos galicismo spanish term names i...,"[side, galicismos, galicismo, spanish, term, n...","[side, galicismo, galicismo, spanish, term, na..."
2,2,re : equistar deal tickets are you still avail...,Safe Email,equistar deal tickets still available assist r...,"[equistar, deal, tickets, still, available, as...","[equistar, deal, ticket, still, avail, assist,..."
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,hello hot lil horny toy one dream open minded ...,"[hello, hot, lil, horny, toy, one, dream, open...","[hello, hot, lil, horni, toy, one, dream, open..."
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email,software incredibly low prices lower drapery s...,"[software, incredibly, low, prices, lower, dra...","[softwar, incred, low, price, lower, draperi, ..."


In [99]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_words(tokens):
    """Return a list holding ``lemmatizer.lemmatize(token)`` for every token in ``tokens``."""
    return list(map(lambda token: lemmatizer.lemmatize(token), tokens))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hiros\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [104]:
# Lemmatize the tokenized text rather than the stemmed tokens: the WordNet
# lemmatizer works by dictionary lookup, and Porter stems such as "softwar"
# or "incred" are not dictionary words, so feeding it stems defeats its purpose.
df['Lemmatized Text'] = df['Tokenize Text'].apply(lemmatize_words)

In [102]:
df.head()

Unnamed: 0,Email_count,Email Text,Email Type,Cleaned Text,Tokenize Text,Stemmed Text
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,disc uniformitarianism sex lang dick hudson ob...,"[disc, uniformitarianism, sex, lang, dick, hud...","[disc, uniformitarian, sex, lang, dick, hudson..."
1,1,the other side of * galicismos * * galicismo *...,Safe Email,side galicismos galicismo spanish term names i...,"[side, galicismos, galicismo, spanish, term, n...","[side, galicismo, galicismo, spanish, term, na..."
2,2,re : equistar deal tickets are you still avail...,Safe Email,equistar deal tickets still available assist r...,"[equistar, deal, tickets, still, available, as...","[equistar, deal, ticket, still, avail, assist,..."
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,hello hot lil horny toy one dream open minded ...,"[hello, hot, lil, horny, toy, one, dream, open...","[hello, hot, lil, horni, toy, one, dream, open..."
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email,software incredibly low prices lower drapery s...,"[software, incredibly, low, prices, lower, dra...","[softwar, incr, low, price, lower, draperi, se..."
