# Spam Detection

In [1]:
# importing dependecies 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

In [2]:
# Load dataset
spam_data = pd.read_csv(r'Desktop/spam.csv', encoding='latin-1')

In [3]:
spam_data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
spam_data.shape

(5572, 5)

In [5]:
spam_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
spam_data = spam_data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1)

In [7]:
spam_data = spam_data.rename(columns={'v1':'Category','v2':'Messages'})
spam_data.columns

Index(['Category', 'Messages'], dtype='object')

In [8]:
spam_data.groupby('Category').describe()

Unnamed: 0_level_0,Messages,Messages,Messages,Messages
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [9]:
spam_data['spam'] = spam_data['Category'].apply(lambda x: 1 if x == 'spam' else 0)

In [10]:
spam_data

Unnamed: 0,Category,Messages,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [11]:
# Create Train and test split
X_train , X_test , Y_train , Y_test = train_test_split(spam_data.Messages, spam_data.spam, test_size = 0.25)

In [12]:
# find word count
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train.values)

In [13]:
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#  Naive Bayes

In [14]:
nb = MultinomialNB()
nb.fit(X_train_count, Y_train)

In [15]:
X_test_count = cv.transform(X_test)
nb.score(X_test_count, Y_test)

0.9856424982053122

# Support Vector Machine

In [16]:
svm = svm.SVC(kernel='linear')
svm.fit(X_train_count, Y_train)

In [17]:
predict = svm.predict(X_test_count)
accuracy = svm.score(X_test_count, Y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.98
