In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import classification_report
import re

In [2]:
# Load the e-mail corpus and binarise the label column:
#   "Safe Email" -> 0, every other label (i.e. phishing) -> 1
data = pd.read_csv("Phishing_Email.csv")
is_safe = data["Email Type"] == "Safe Email"
data["Email Type"] = np.where(is_safe, 0, 1)
# Missing bodies become a single space so later text processing always sees a str.
data["Email Text"] = data["Email Text"].fillna(" ")
# Discard the "Unnamed: 0" column from the CSV.
del data["Unnamed: 0"]
data

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0
1,the other side of * galicismos * * galicismo *...,0
2,re : equistar deal tickets are you still avail...,0
3,\nHello I am your hot lil horny toy.\n I am...,1
4,software at incredibly low prices ( 86 % lower...,1
...,...,...
18645,date a lonely housewife always wanted to date ...,1
18646,request submitted : access request for anita ....,0
18647,"re : important - prc mtg hi dorn & john , as y...",0
18648,press clippings - letter on californian utilit...,0


In [3]:
# Count missing values per column; "Email Text" was filled with " " when loading,
# so both counts are expected to be zero.
data.isna().sum()

Email Text    0
Email Type    0
dtype: int64

### STEMMING THE TEXT

In [4]:
porter=PorterStemmer()
# Build the stop-word lookup once. Calling stopwords.words("english") inside the
# comprehension rebuilt the list for every single token and then searched it
# linearly; a frozenset gives the same membership answers in constant time.
_ENGLISH_STOPWORDS=frozenset(stopwords.words("english"))

def stm(data):
    """Normalise one e-mail body into a space-joined string of Porter stems.

    Steps, in order: replace every character that is not an ASCII letter with
    a space, lower-case the text, split on whitespace, drop English stop
    words, and stem the remaining tokens.

    Parameters
    ----------
    data : str
        Raw e-mail text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ("" if nothing survives).
    """
    # Digits, punctuation and non-ASCII characters all become separators.
    letters_only=re.sub("[^a-zA-Z]"," ",data)
    tokens=letters_only.lower().split()
    # Stop words are filtered on the lower-cased token, before stemming.
    stems=[porter.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS]
    return " ".join(stems)

In [5]:
#zeros=data[data["Email Type"]==0]
#ones=data[data["Email Type"]==1]
#b_data=pd.concat([zeros,ones])
#b_data=b_data.sample(len(b_data),random_state=34)

In [6]:
# Run stm on every raw e-mail (one call per row) and keep the result in a new
# column next to the original text.
data["stemed_email_text"]=data["Email Text"].apply(stm)

In [8]:
# Display the frame to compare the raw text with the new stemmed column.
data

Unnamed: 0,Email Text,Email Type,stemed_email_text
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0,disc uniformitarian sex lang dick hudson obser...
1,the other side of * galicismos * * galicismo *...,0,side galicismo galicismo spanish term name imp...
2,re : equistar deal tickets are you still avail...,0,equistar deal ticket still avail assist robert...
3,\nHello I am your hot lil horny toy.\n I am...,1,hello hot lil horni toy one dream open mind pe...
4,software at incredibly low prices ( 86 % lower...,1,softwar incred low price lower draperi sevente...
...,...,...,...
18645,date a lonely housewife always wanted to date ...,1,date lone housewif alway want date lone housew...
18646,request submitted : access request for anita ....,0,request submit access request anita dupont enr...
18647,"re : important - prc mtg hi dorn & john , as y...",0,import prc mtg hi dorn john discov recent stil...
18648,press clippings - letter on californian utilit...,0,press clip letter californian util pleas find ...


### TRAIN_TEST_SPLIT

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Features are the stemmed text; the target is the 0/1 label built when loading.
x=data["stemed_email_text"]
y=data["Email Type"]
# Hold out 25% of the rows for testing. The seed is fixed so that a fresh
# Restart & Run All reproduces the same split, and therefore the same metrics.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)

### VECTORISING STRING TO NUMERIC

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser with default settings.
vc=TfidfVectorizer()
# The vectoriser is fitted on the training split only and then reused, already
# fitted, for the test split.
# NOTE: x_train / x_test are rebound here from text Series to the transformed
# matrices, so re-running this cell without first re-running the split cell
# would feed already-vectorised data back into the vectoriser.
x_train=vc.fit_transform(x_train)
x_test=vc.transform(x_test)

### MODELING AND TRAINING

In [13]:
from sklearn.linear_model import LogisticRegression
# max_iter is raised to 1000 (scikit-learn's default is 100) -- presumably to
# give the solver room to converge on the TF-IDF features; TODO confirm.
model=LogisticRegression(max_iter=1000)
# Fit on the vectorised training split.
model.fit(x_train,y_train)

In [14]:
# Mean accuracy on the training split, i.e. on the rows the model was fitted on.
model.score(x_train,y_train)

0.9799099163508973

In [15]:
# Predict on the same rows the model was fitted on.
train_prediction=model.predict(x_train)

# Per-class precision / recall / F1 for the training split (0 = safe, 1 = phishing).
report=classification_report(y_train,train_prediction)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      8518
           1       0.96      0.99      0.97      5469

    accuracy                           0.98     13987
   macro avg       0.98      0.98      0.98     13987
weighted avg       0.98      0.98      0.98     13987



### TESTING

In [16]:
# Predict on the held-out split, which was only transformed (never fitted on).
test_prediction=model.predict(x_test)

# Per-class precision / recall / F1 for the held-out split.
# Note: this rebinds `report`, replacing the training report from the earlier cell.
report=classification_report(y_test,test_prediction)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2804
           1       0.96      0.97      0.96      1859

    accuracy                           0.97      4663
   macro avg       0.97      0.97      0.97      4663
weighted avg       0.97      0.97      0.97      4663



# TESTING WITH NEW DATA

In [17]:
# Draw one random row from the full dataset (no seed, so a different row each run).
# NOTE: the row can come from the training split, so this is a smoke test rather
# than an evaluation on unseen data.
test=data.sample()
# The sampled row already carries "stemed_email_text" from the earlier apply on
# `data`; this line recomputes it from the raw text.
test["stemed_email_text"]=test["Email Text"].apply(stm)
test

Unnamed: 0,Email Text,Email Type,stemed_email_text
14712,Eliminate your credit card debt without filin...,1,elimin credit card debt without file bankruptc...


In [18]:
# Vectorise the sampled row with the already-fitted TF-IDF vectoriser (transform only, no refit).
test_message=vc.transform(test["stemed_email_text"])

In [19]:
# Predicted label for the sampled row: 0 = safe, 1 = phishing.
model.predict(test_message)

array([1])

In [20]:
def testing(message):
    """Classify e-mail text as spam/phishing or safe using the fitted model.

    Parameters
    ----------
    message : pandas.Series or str
        Raw (un-stemmed) e-mail text. A Series is expected to hold a single
        message; if it holds several, only the first one is classified.
        A bare string is accepted as well.

    Returns
    -------
    str
        The spam warning when the model predicts class 1, otherwise the
        "all good" notice. Both outcomes are returned as strings.

    Raises
    ------
    ValueError
        If ``message`` contains no entries.
    """
    # Accept a bare string as a convenience; otherwise treat the input as a Series.
    texts=pd.Series([message]) if isinstance(message,str) else pd.Series(message)
    if texts.empty:
        raise ValueError("testing() needs at least one message to classify")
    # Fill missing values *before* stemming (stm requires str) and keep the
    # result in a local Series. Writing it into the global `test` frame aligned
    # on index labels, so any message whose index differed from the sampled
    # row's index turned into NaN -> "" and the model scored an empty document.
    stemmed=texts.fillna("").astype(str).apply(stm)
    features=vc.transform(stemmed)
    # predict() returns an array; pick the single label explicitly instead of
    # relying on the truth value of an array comparison.
    label=model.predict(features)[0]
    if label==1:
        return "Its A Spam Message Cautions"
    # Return the text (the original returned the None produced by print()).
    return "Its All good\nYou Have A New Message"

In [21]:
# Draw another random row from the full dataset (unseeded; it may be a training row).
test=data.sample()
# Recompute the stemmed text for the sampled row from its raw text.
test["stemed_email_text"]=test["Email Text"].apply(stm)
test

Unnamed: 0,Email Text,Email Type,stemed_email_text
10810,"re : deal 156071 feb 00 daren , i must be miss...",0,deal feb daren must miss someth meet today sho...


In [22]:
# Classify the sampled row from its raw text; testing() does the stemming itself.
testing(test["Email Text"])

Its All good
You Have A New Message


In [23]:
# pandas is already imported in the first cell; this repeat import is redundant.
import pandas as pd

# Creating a new professional message
# A hand-written, legitimate-looking e-mail used as a manual test case.
data_author_message = {
    "message": [
        '''Subject: Meeting Agenda for Thursday's Product Review

Message: Dear Team,

I hope this message finds you well. As we approach our Thursday product review meeting, I wanted to share the proposed agenda so we can all come prepared:

1. Progress on the Q2 roadmap
2. Feature requests and prioritization
3. Customer feedback from the latest release
4. Next sprint planning

Please let me know if you have any items to add or if there are any scheduling conflicts. Looking forward to a productive session.

Best regards,
Anna Rodriguez
Product Manager
TechNova Solutions'''
    ]
}
# One-row frame with a single "message" column and a default 0-based index.
df= pd.DataFrame(data_author_message)

In [24]:
# Classify the hand-written professional message defined in the previous cell.
testing(df["message"])

'Its A Spam Message Cautions'

In [25]:
# A second hand-written, legitimate-looking e-mail used as a manual test case.
data_new_message = {
    "message": [
        '''Subject: Reminder: Submit Your Monthly Report

Message: Hi Alex,

Just a friendly reminder to submit your monthly progress report by Friday, end of day. This helps us keep track of ongoing projects and identify any areas where support is needed.

If you have any questions or need extra time, feel free to reach out.

Thanks in advance!

Best,  
Jordan Lee  
Operations Coordinator'''
    ]
}
# Rebinds `df` to a new one-row frame holding this message.
df= pd.DataFrame(data_new_message)
df

Unnamed: 0,message
0,Subject: Reminder: Submit Your Monthly Report\...


In [26]:
# Classify the hand-written reminder message defined in the previous cell.
testing(df["message"])

'Its A Spam Message Cautions'

In [27]:
# Creating a spam message for the DataFrame
# A hand-written prize-scam e-mail used as a manual positive test case.
data_spam_message = {
    "message": [
        '''Subject: You've Won a $1000 Cash Prize!

Message: Congratulations! You've been selected to receive a $1000 cash prize. All you need to do to claim your prize is follow the link below and fill out your personal information, including bank account details, to receive the funds.

Don't miss out on this incredible opportunity, but hurry, time is running out!

Claim your cash prize now: [link]

Best regards,  
The Prize Distribution Team  
**Important**: This is an urgent message, please do not ignore it!'''
    ]
}

# Rebinds `df` to a new one-row frame holding this message.
df= pd.DataFrame(data_spam_message)
df

Unnamed: 0,message
0,Subject: You've Won a $1000 Cash Prize!\n\nMes...


In [29]:
# Classify the hand-written scam message defined in the previous cell.
testing(df["message"])

'Its A Spam Message Cautions'