# Import Necessary Libraries

In [13]:
import pandas as pd 
import numpy as np
import re 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load The Dataset

In [46]:
df = pd.read_csv('phishing_Email.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


# Clean The Data

# Step1. Handling Missing Values

In [15]:
print(df.isnull().sum())

Unnamed: 0     0
Email Text    16
Email Type     0
dtype: int64


# Step2. Drop Rows With Missing Values

In [16]:
df.dropna(inplace=True)

In [20]:
df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


# Clean The Text

In [21]:
def clean_text(text):
    # REMOVE HTML TAGS
    text = re.sub(r'<.*?>','',text)
    
    # REMOVE EMAIL ADDRESSS
    text = re.sub(r'\s*@\s*\s?','',text)
    
    # REMOVE URLS
    text = re.sub(r'http\s+', '', text)
    
    # REMOVE NON-ALPHABETIC CHARACTERS
    text = re.sub(r'[^a-zA-Z]', '', text)
    
    # CONVERT TO LOWERCASE
    text = text.lower()
    
    #  REMOVE EXTRA SPACES 
    text = ''.join(text.split())
    return text
    

# Apply Cleaning Functiom To The 'Email Text' column

In [23]:
df['Cleaned_Email'] = df['Email Text'].apply(clean_text)

# Encode Label

In [24]:
# IMITIALISED LabelEncoder
le = LabelEncoder()

# Encode Email Type As Binary Label
df['Label'] = le.fit_transform(df['Email Type'])

# Split The Data

In [29]:
# D Define feature and target 
x = df['Cleaned_Email']
y = df['Label']

# split data into training and testing test (80% for training and 20% for testing)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)


# Vectorise The Test Data 

In [30]:
# initialise TF-IDF vextorizer
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')

# fit and transform the training data
x_train_tfidf = vectorizer.fit_transform(x_train)

# transform the test data
x_test_tfidf = vectorizer.transform(x_test)


# Train the Logistic Regression Model

In [32]:
# initialise the Logistic Regression Model
model = LogisticRegression()

# train the model
model.fit(x_train_tfidf, y_train)

# Evaluate the Model

In [36]:
# predict on the test data
y_pred = model.predict(x_test_tfidf)

# Evaluate accuracy 
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy:, {accuracy * 100:.2f}%')

Accuracy:, 60.80%


# Test With New Emails

In [42]:
# sample emails for prediction
new_emails = [
    "Congratulations! You've won a $1000 gift card. click here to claim,",
    "Hi John, can you send me the report by EOD? Thanks!"]
    

In [50]:
# clean the new emails 
new_emails_cleaned = [clean_text(email) for email in new_emails]

# transform the new email using the trained vectorizer
new_emails_tfidf = vectorizer.transform(new_emails_cleaned)

# Predict using the trained model
predictions = model.predict(new_emails_tfidf)

# Decode predictions
decoded_predictions = le.inverse_transform(predictions)
print(decoded_predictions)

['Safe Email' 'Safe Email']
