This program detects if an e-mail is spam(1) or not (0)

In [1]:
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
import string
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/sms-spam-collection-dataset/spam.csv


In [2]:
pd.set_option('display.max_columns',None)
pd.set_option('display.expand_frame_repr',False)

Read the data

In [3]:
data = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv")
data.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will �_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
data["Unnamed: 4"].unique()

array([nan, ' just Keep-in-touch\\" gdeve.."', 'GNT:-)"',
       ' Never comfort me with a lie\\" gud ni8 and sweet dreams"',
       ' CALL 2MWEN IM BK FRMCLOUD 9! J X\\""',
       ' one day these two will become FREINDS FOREVER!"'], dtype=object)

Columns 2,3,4 contain no important data and can be deleted.
Also, we rename column v1 as "label" and v2 as "text"

In [6]:
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
column_name = ["Spam","Text"]
data.columns = column_name

In [7]:
data.head()

Unnamed: 0,Spam,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Update label names

In [8]:
data["Spam"].loc[data["Spam"] == "ham"] = 1
data["Spam"].loc[data["Spam"] == "spam"] = 0

In [9]:
data["Spam"].unique()

array([1, 0], dtype=object)

In [10]:
Spam = [liste for liste in data["Spam"]]

In [11]:
Spam = pd.DataFrame(Spam)
type(Spam)

pandas.core.frame.DataFrame

In [12]:
df = pd.concat([Spam,data["Text"]],axis=1)
df.head()

Unnamed: 0,0,Text
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
column_name = ["Spam","Text"]
df.columns = column_name

In [14]:
df.head()

Unnamed: 0,Spam,Text
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
type(df["Spam"])

pandas.core.series.Series

In [16]:
df.shape

(5572, 2)

In [17]:
df.columns

Index(['Spam', 'Text'], dtype='object')

In [18]:
df.isnull().any()

Spam    False
Text    False
dtype: bool

Stopwords package

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Remove Punctuation and Stopwords

In [20]:
def process(text):
    punc = [c for c in text if c not in string.punctuation]
    punc = ''.join(punc)
    stopw = [w for w in punc.split() if w.lower() not in stopwords.words('english') ]
    return stopw

In [21]:
df['Text'].head().apply(process)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: Text, dtype: object

Convert the text to a matrix of special word counts

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
message = CountVectorizer(analyzer=process).fit_transform(df['Text'])

In [23]:
message.shape

(5572, 11301)

In [24]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(message, df['Spam'],test_size=0.2, random_state=0)

Naive Bayes Classifier

In [25]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(x_train,y_train)

In [26]:
#prediction
classifier.predict(x_train)

array([1, 1, 1, ..., 1, 1, 1])

In [27]:
#Actual
y_train.values

array([1, 1, 1, ..., 1, 1, 1])

In [28]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [29]:
def report(y_train, x_train):
    pred = classifier.predict(x_train)
    print('Report: \n',classification_report(y_train, pred))
    print('Confusion Matrix: \n', confusion_matrix(y_train, pred))
    print('\n')
    print('Accuracy: \n', accuracy_score(y_train, pred))

In [30]:
report(y_train, x_train)

Report: 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       581
           1       1.00      1.00      1.00      3876

    accuracy                           0.99      4457
   macro avg       0.99      0.99      0.99      4457
weighted avg       0.99      0.99      0.99      4457

Confusion Matrix: 
 [[ 569   12]
 [  12 3864]]


Accuracy: 
 0.9946152120260264
