# Email Spam Filtering Using Machine Learning

This notebook classifies e-mails as Ham (non-spam) or Spam using a Multinomial Naive Bayes model trained on bag-of-words counts.

### Importing Libraries

In [144]:
# Importing all the required libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

print('Import Completed!')

Import Completed!


### Reading the data 

In [146]:
# Read the CSV into a pandas DataFrame, decoding the file as latin-1
data = pd.read_csv("spam.csv", encoding="latin-1")

# Preview the first five rows
data.head(n=5)

Unnamed: 0,class,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Data Pre-Processing

In [147]:
# Shape of the data as (rows, columns), checked before any columns are dropped
data.shape

(5572, 5)

In [148]:
# Viewing all the column labels of the freshly loaded frame
data.columns

Index(['class', 'message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [149]:
# Dropping the unnecessary columns.
# Reassign instead of mutating with inplace=True, and ignore columns that are
# already gone, so re-running this cell does not raise a KeyError.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [150]:
# Final data-set after cleaning: only the 'class' and 'message' columns remain
data.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [151]:
# Encode the labels numerically: ham (non-spam) -> 0, spam -> 1
data['class'] = data['class'].map(dict(ham=0, spam=1))
data.head()

Unnamed: 0,class,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [152]:
# Input is the raw message text; output is the class column (0 = ham, 1 = spam)
X, y = data['message'], data['class']

In [155]:
# Count the missing entries in each column
data.isna().sum()

class      0
message    0
dtype: int64

In [156]:
# Show the inputs and the labels, separated by a dotted rule
print(X, '...................', y, sep='\n')

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object
...................
0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: class, Length: 5572, dtype: int64


### Train Test Split

In [157]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)
print('Data Splitted!')

Data Splitted!


### Feature Extraction: 

In [158]:
# Bag-of-words features: the vocabulary is fitted on the training messages only
# and then reused unchanged to transform the test messages.
# `lowercase` must be the boolean True; the string 'True' is rejected by
# scikit-learn's parameter validation in recent releases.
cv = CountVectorizer(min_df=1, stop_words='english', lowercase=True)
x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

# Convert y_train and y_test values as integers
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [159]:
# Size of the training count matrix: (messages, vocabulary terms)
x_train.shape

(4457, 7472)

In [160]:
# Size of the test count matrix: (messages, vocabulary terms)
x_test.shape

(1115, 7472)

### Training the Model - Multinomial Naive Bayes

In [161]:
from sklearn.naive_bayes import MultinomialNB

In [162]:
# Multinomial Naive Bayes classifier, constructed with no arguments (library defaults)
model = MultinomialNB()

In [163]:
# Fit the classifier on the training count matrix and its integer labels
model.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Evaluation of the model

In [166]:
# Score the fitted classifier on the held-out messages
prediction = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)

print('Accuracy of Model on Test Data : ', accuracy)

Accuracy of Model on Test Data :  0.9838565022421525


In [167]:
# Disabled alternative, kept for reference: the classifier's own score() method
# reports mean accuracy on the given test data and labels.
#acc = model.score(x_test, y_test)
#print('Accuracy of Model on test data : ', acc)

### Prediction on new mail

In [138]:
msg = "You Won 500$"
# Keep the one-message batch in its own name: binding it to `data` would
# overwrite the DataFrame loaded at the top of the notebook.
new_mail = [msg]
vect = cv.transform(new_mail).toarray()
my_prediction = model.predict(vect)

print(my_prediction)


# Label 0 is ham (non-spam); anything else is reported as spam
if my_prediction[0] == 0:
    print('This is not a Spam mail')
else:
    print('This is a Spam mail')

[1]
This is a Spam mail


In [141]:
msg = "Your interview is scheduled tomorrow at 8 am."
# Keep the one-message batch in its own name: binding it to `data` would
# overwrite the DataFrame loaded at the top of the notebook.
new_mail = [msg]
vect = cv.transform(new_mail).toarray()
my_prediction = model.predict(vect)

print(my_prediction)


# Label 0 is ham (non-spam); anything else is reported as spam
if my_prediction[0] == 0:
    print('This is not a Spam mail')
else:
    print('This is a Spam mail')

[0]
This is not a Spam mail


In [104]:
# Inspect the dense count vector produced by cv.transform(...).toarray()
vect

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [105]:
import pickle

# Persist the fitted classifier and vectorizer. The `with` blocks guarantee the
# file handles are flushed and closed before the model file is read back.
with open('spam.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

# Reload the classifier from disk; the reloaded copy (`model1`) is what the
# cells below use for prediction.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('spam.pkl', 'rb') as model_file:
    model1 = pickle.load(model_file)


In [106]:
from win32com.client import Dispatch

In [107]:
def speak(text):
    """Read ``text`` aloud through the "SAPI.SpVoice" COM object."""
    voice = Dispatch("SAPI.SpVoice")
    voice.Speak(text)

In [108]:
def result(msg):
    """Classify ``msg`` with the reloaded model, then speak and print the verdict."""
    features = cv.transform([msg]).toarray()
    is_spam = model1.predict(features)[0] == 1
    verdict = "This is a Spam mail" if is_spam else "This is not a Spam mail"
    speak(verdict)
    print(verdict)

In [109]:
import tkinter as tk

In [110]:
# Minimal Tk front-end: type a message, click the button, and the verdict is
# spoken and printed.
root = tk.Tk()
root.geometry("200x200")
l2 = tk.Label(root, text="Email Spam Classification Application")
l2.pack()
l1 = tk.Label(root, text="Enter Your Message:")
l1.pack()
text = tk.Entry(root)
text.pack()


def on_click():
    """Classify whatever is currently typed in the entry box."""
    # Delegate to the one-argument result(msg) defined earlier in the notebook
    # rather than redefining `result` here, which silently shadowed it and
    # duplicated its body.
    result(text.get())


B = tk.Button(root, text="Click", command=on_click)
B.pack()

# Runs the Tk event loop; the cell does not finish until the window is closed.
root.mainloop()

This is a Spam mail
