**Spam Email Detection using Machine Learning**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [5]:
#no NUll values are here.And other way to check is:
data.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [6]:
data.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

In [7]:
data.shape

(5171, 4)

In [8]:
#from the data there is no use of the (Column1,Column4):(Unnamed:0,label_num)

In [9]:
data = data.drop(['Unnamed: 0','label_num'],axis=1)

In [10]:
data

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...
...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...
5169,ham,Subject: industrial worksheets for august 2000...


In [11]:
data['label'].unique()

array(['ham', 'spam'], dtype=object)

In [12]:
#ham indicates emails that are not spam
#spam indicates emails that are spam 
'''So we have 2 types of class which are spam and ham so we can use logistic regression to classify
whether emails are spam or not''' 

'So we have 2 types of class which are spam and ham so we can use logistic regression to classify\nwhether emails are spam or not'

In [13]:
#labeling spam as 0 and ham as 1 for the logistic regression.
data['label']=data['label'].map({'spam':0,'ham':1})

In [14]:
data

Unnamed: 0,label,text
0,1,Subject: enron methanol ; meter # : 988291\r\n...
1,1,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,1,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,0,"Subject: photoshop , windows , office . cheap ..."
4,1,Subject: re : indian springs\r\nthis deal is t...
...,...,...
5166,1,Subject: put the 10 on the ft\r\nthe transport...
5167,1,Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168,1,Subject: calpine daily gas nomination\r\n>\r\n...
5169,1,Subject: industrial worksheets for august 2000...


In [15]:
x = data['label']

In [16]:
x

0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: label, Length: 5171, dtype: int64

In [17]:
y = data['text']

In [18]:
y

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object

In [19]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [20]:
x_train = x_train.astype('int')
x_test = x_test.astype('int')

In [21]:
print(x.shape)
print(x_train.shape)
print(x_test.shape)

(5171,)
(4136,)
(1035,)


**Feature Extraction**

In [22]:
feature_extraction = TfidfVectorizer(min_df=1,stop_words='english',lowercase='True')
#min_df is tell the model that what is the minimum frequency for a word to consider.
#stop_words is a inbuilt function which is used to tell the model to remove the basic words like the,as,a etc.
#lowercase is used to lowercase all letters in the mail.

In [23]:
y_train_feat = feature_extraction.fit_transform(y_train)
y_test_feat = feature_extraction.transform(y_test)

In [24]:
print(y_train_feat) #Text is successfully converted into the numerical form.

  (0, 39122)	0.1489707406646257
  (0, 21876)	0.15452467049357102
  (0, 39295)	0.18296953062812027
  (0, 28331)	0.100413186925558
  (0, 16641)	0.1637097662727073
  (0, 25343)	0.1327903166471815
  (0, 24291)	0.086819999802958
  (0, 25011)	0.09352929013170533
  (0, 35741)	0.1797981565768853
  (0, 38670)	0.12368485263465394
  (0, 12407)	0.11614143442868556
  (0, 64)	0.2320330901784706
  (0, 72)	0.11287588933297091
  (0, 2003)	0.22022236239236187
  (0, 56)	0.19294689361509368
  (0, 28212)	0.13465733801362062
  (0, 26193)	0.13355578907978374
  (0, 17814)	0.15230033297810594
  (0, 27541)	0.15176632211845267
  (0, 12467)	0.19120758987148667
  (0, 35035)	0.2927898419075641
  (0, 4132)	0.5855796838151281
  (0, 26894)	0.29612657680252463
  (0, 37566)	0.03390898877474294
  (1, 2308)	0.12427819881705193
  :	:
  (4135, 36363)	0.10387970460445373
  (4135, 37874)	0.1099845935208329
  (4135, 2822)	0.12063179979974499
  (4135, 38317)	0.10810759560254152
  (4135, 40607)	0.10560544386588379
  (4135, 40291

**Logistic Regression Model.
Training the Model.**

In [25]:
model = LogisticRegression()


In [26]:
model.fit(y_train_feat,x_train)

LogisticRegression()

**Checking the Accuracy of the Model**

In [27]:
prediction_on_train_data = model.predict(y_train_feat)

In [28]:
accuracy_on_train_data = accuracy_score(x_train,prediction_on_train_data)
print(accuracy_on_train_data)

0.9961315280464217


In [29]:
prediction_on_test_data = model.predict(y_test_feat)
accuracy_on_test_data = accuracy_score(x_test,prediction_on_test_data)
print(accuracy_on_test_data)

0.9884057971014493


**Predicting the mail**

Here the input mail is taken form the dataset.

In [30]:
input_mail = ["Subject: re : epgtgloria , the difference between the two pipes for july 2000 is the actuals came in lower than what was nominated and scheduled on mops . there isn ' t anything we can do about that difference , hopefully there is some kind of oba that takes those variances .sabrafrom : barkowsky , gloria g .sent : friday , june 22 , 2001 4 : 50 pmto : garcia , clarissa ; farmer , daren j . ; dinari , sabra l subject : epgtclarissa - thanks so much for all your help with this pipe ! everything looks great . i just have a couple of pathsthat i need to finish it :january 2000 - i need deal # 854688 pathed for epgt and for tetc . according to the invoice , we should have11 , 129 dth on the interconnect .february 2000 - i need deal # 871184 pathed for hpl and chan . hpl should have 3 , 600 dth and chan shouldhave 11 , 500 dth on the interconnect .july 2000 - deal # 871172 has an interconnect issue . according to mops contract # 105124 , they received 8 , 275 dthon the matagorda 624 , but according to epgt , they delivered 10 , 362 dth to hpl ( ? ) could this possiblyneed to be split somehow , or do you have any other ideas ?let me know . thanks , gloria 3 - 7118"]
input_mail1 = ["The Exam fee is 200"] #user defined one
predict = feature_extraction.transform(input_mail1)
final_prediction = model.predict(predict)
print(final_prediction)
if final_prediction[0]== 1:
    print('Ham mail')
else:
    print('Spam mail')   

[1]
Ham mail
