# Email Spam Detection Using Python

# Task-2 By -Maitri Jain Data Science Intern at coderCave

In [None]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('spam_ham_dataset.csv')
df
df.head(5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
df.shape

(5171, 4)

In [4]:
df.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

# Data Cleaning and Processing


Removing unnecessary text that does not add any meaning to the email

In [5]:
import re

In [6]:
# define a function to clean text data using regular expressions

def clean_text(text):
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
        # tokenize the text
    tokens = re.split(r"\s", text)
        # convert tokens to lowercase
    tokens = [token.lower() for token in tokens]
        # remove stop words
    stop_words = set(['the', 'and', 'to', 'of', 'a', 'in', 'that', 'is', 'it', 'with', 'for'])
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # join filtered tokens back into text
    clean_text = ' '.join(filtered_tokens)
    
    
    return clean_text

# apply the function to the 'text' column in the DataFrame
df['clean_text'] = df['text'].apply(clean_text)

In [7]:
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num,clean_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter 988291 this follo...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom january 9 2001 see attached fi...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho we re around mos...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject re indian springs this deal book teco ...
...,...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0,subject put 10 on ft transport volumes decreas...
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0,subject 3 4 2000 following noms hpl can t take...
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0,subject calpine daily gas nomination julie as ...
5169,1409,ham,Subject: industrial worksheets for august 2000...,0,subject industrial worksheets august 2000 acti...


In [8]:
df['clean_text']

0       subject enron methanol meter 988291 this follo...
1       subject hpl nom january 9 2001 see attached fi...
2       subject neon retreat ho ho ho we re around mos...
3       subject photoshop windows office cheap main tr...
4       subject re indian springs this deal book teco ...
                              ...                        
5166    subject put 10 on ft transport volumes decreas...
5167    subject 3 4 2000 following noms hpl can t take...
5168    subject calpine daily gas nomination julie as ...
5169    subject industrial worksheets august 2000 acti...
5170    subject important online banking alert dear va...
Name: clean_text, Length: 5171, dtype: object

In [9]:
df['clean_text'] = df['clean_text'].str.replace('subject ', '')
df['clean_text']

0       enron methanol meter 988291 this follow up not...
1       hpl nom january 9 2001 see attached file hplno...
2       neon retreat ho ho ho we re around most wonder...
3       photoshop windows office cheap main trending a...
4       re indian springs this deal book teco pvr reve...
                              ...                        
5166    put 10 on ft transport volumes decreased from ...
5167    3 4 2000 following noms hpl can t take extra 1...
5168    calpine daily gas nomination julie as i mentio...
5169    industrial worksheets august 2000 activity att...
5170    important online banking alert dear valued cit...
Name: clean_text, Length: 5171, dtype: object

In [10]:
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num,clean_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,enron methanol meter 988291 this follow up not...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,hpl nom january 9 2001 see attached file hplno...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,neon retreat ho ho ho we re around most wonder...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,photoshop windows office cheap main trending a...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,re indian springs this deal book teco pvr reve...
...,...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0,put 10 on ft transport volumes decreased from ...
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0,3 4 2000 following noms hpl can t take extra 1...
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0,calpine daily gas nomination julie as i mentio...
5169,1409,ham,Subject: industrial worksheets for august 2000...,0,industrial worksheets august 2000 activity att...


# Now that we have cleaned our text data so its time to split the train and test variable for this we will need to import train test split module from Scikitlearn library

In [11]:
import sklearn

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X = df['clean_text']
y = df['label_num'] 
X

0       enron methanol meter 988291 this follow up not...
1       hpl nom january 9 2001 see attached file hplno...
2       neon retreat ho ho ho we re around most wonder...
3       photoshop windows office cheap main trending a...
4       re indian springs this deal book teco pvr reve...
                              ...                        
5166    put 10 on ft transport volumes decreased from ...
5167    3 4 2000 following noms hpl can t take extra 1...
5168    calpine daily gas nomination julie as i mentio...
5169    industrial worksheets august 2000 activity att...
5170    important online banking alert dear valued cit...
Name: clean_text, Length: 5171, dtype: object

In [14]:
y

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
X_train

1023    re tenaska i see demand fee changes williams p...
4586    strong buy alert monthly newsletter topstocks ...
2955    performance feedback each you have been chosen...
2495    hr performance objectives binders good morning...
3353    fw fwd fw drawing by school age child pa fwd t...
                              ...                        
4426    re ena sales on hpl last i had was legal was r...
466     tenaska iv bob i understand from sandi you ll ...
3092    broom bristles up flew be differentiable onoma...
3772    calpine daily gas nomination weekend ricky arc...
860     re meter 1459 6 00 yep you re right except s o...
Name: clean_text, Length: 3619, dtype: object

In [18]:
X_test


1566    hpl nom march 30 2001 see attached file hplno ...
1988    online pharxmacy 80 off all meds disscount pha...
1235    re nom actual volume april 17 th we agree eile...
2868    re meter 8740 dec 99 robert i put our heads to...
4903    re coastal oil gas corporation melissa deal 34...
                              ...                        
5135    revision 1 enron hpl actuals august 3 2000 iss...
2298    re discrepancies price gas redelivered at mobi...
1519    well head here list meter i moved from lst on ...
1740    jordyn there nothing like dream create future ...
1700    union gas thamm 1 tom thamm 1 well came on lin...
Name: clean_text, Length: 1552, dtype: object

# Feature Extraction

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [20]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(max_df=0.7, stop_words='english')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [21]:
y_test, y_train

(1566    0
 1988    1
 1235    0
 2868    0
 4903    0
        ..
 5135    0
 2298    0
 1519    0
 1740    1
 1700    0
 Name: label_num, Length: 1552, dtype: int32,
 1023    0
 4586    1
 2955    0
 2495    0
 3353    0
        ..
 4426    0
 466     0
 3092    1
 3772    0
 860     0
 Name: label_num, Length: 3619, dtype: int32)

In [22]:
print(X_test_features)

  (0, 40502)	0.34416867567520637
  (0, 27174)	0.1646067888834387
  (0, 24928)	0.197087795068719
  (0, 20406)	0.49655991440974956
  (0, 20401)	0.13037863127281704
  (0, 17089)	0.168138035912095
  (0, 6274)	0.14199092872157937
  (0, 1509)	0.6750070496138264
  (0, 1379)	0.16217284181745661
  (0, 932)	0.14721488399652838
  (1, 41154)	0.005154297078281474
  (1, 40971)	0.08870355300858138
  (1, 39930)	0.06910443214882987
  (1, 39514)	0.08147098934494065
  (1, 39303)	0.09552690324001073
  (1, 38929)	0.004029150619894842
  (1, 38814)	0.08870355300858138
  (1, 38684)	0.09552690324001073
  (1, 38141)	0.09552690324001073
  (1, 37901)	0.08447277448127313
  (1, 37653)	0.07300943229032412
  (1, 37471)	0.05054499158560902
  (1, 37239)	0.0037503107316411567
  (1, 37204)	0.08870355300858138
  (1, 37192)	0.002804269307207404
  :	:
  (1551, 20439)	0.12623257519242975
  (1551, 18381)	0.223172375027066
  (1551, 18195)	0.16824692943813288
  (1551, 18169)	0.14040130272918946
  (1551, 17544)	0.187354960638256

# Now that everything is done, we will finally train our machine learning logistic regression model on the above data


In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
model = LogisticRegression()

# training the Logistic Regression model with the training data

In [25]:
model.fit(X_train_features, y_train)

# Our model if fitted and now its time to check the accuracy of model, but before we will Evaluate this model

In [26]:
prediction_on_training_data = model.predict(X_train_features)

In [27]:
prediction_on_training_data

array([0, 1, 0, ..., 1, 0, 0])

In [28]:
prediction_on_test_data = model.predict(X_test_features)

In [29]:
prediction_on_test_data

array([0, 1, 0, ..., 0, 1, 0])

In [30]:
from sklearn.metrics import accuracy_score


In [31]:
#Accuracy of prediction on training data

In [32]:
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [33]:
accuracy_on_training_data

0.9961315280464217

In [34]:
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)
accuracy_on_test_data

0.9903350515463918