## This notebook detects whether an email is spam (1) or not spam (0)

#### import libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string

#### Reading the csv file

In [2]:
# Load the e-mail dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('emails.csv')

#### Print the first 10 rows of the dataset

In [3]:
# Preview the first 10 rows to check that the columns loaded as expected.
df.head(10)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
5,2949,ham,Subject: ehronline web address change\r\nthis ...,0
6,2793,ham,Subject: spring savings certificate - take 30 ...,0
7,4185,spam,Subject: looking for medication ? we ` re the ...,1
8,2641,ham,Subject: noms / actual flow for 2 / 26\r\nwe a...,0
9,1870,ham,"Subject: nominations for oct . 21 - 23 , 2000\...",0


#### Shape of our dataset

In [4]:
# (rows, columns) of the freshly loaded frame.
df.shape

(5171, 4)

#### Column names

In [5]:
# Column labels available in the frame.
df.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

#### Drop the duplicates

In [6]:
# Rebind the name instead of using inplace=True (discouraged by pandas; it
# prevents chaining and hides the mutation from earlier cells' displays).
# NOTE(review): all columns are compared, including 'Unnamed: 0'; if that column
# is a per-row id, no two rows can ever match — consider subset=['text']
# (TODO confirm against the data).
df = df.drop_duplicates()

#### Check the shape of dataset after dropping the duplicate values

In [7]:
# (rows, columns) after de-duplication; compare with the shape reported right after loading.
df.shape

(5171, 4)

#### Show the number of missing values (NaN / None) in each column

In [8]:
# Count the missing entries per column.
df.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

#### Download the stopwords package

In [9]:
# Fetch the NLTK stop-word corpus used by the text-cleaning step below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Nachiketa
[nltk_data]     Dhal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
def process_text(text, stop_words=None):
    """Strip punctuation and stop words from ``text`` and return the remaining tokens.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used (the 'stopwords' corpus must already be downloaded).

    Returns
    -------
    list of str
        The whitespace-separated words that survive, in their original order
        and with their original casing.
    """
    # Resolve the stop words once per call and keep them in a set: calling
    # stopwords.words('english') inside the comprehension reloads the list
    # and scans it linearly for every single word.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_set = frozenset(stop_words)

    # 1. Remove punctuation characters.
    nopunc = ''.join(char for char in text if char not in string.punctuation)

    # 2. Drop stop words (compared case-insensitively) and
    # 3. return the list of clean words.
    return [word for word in nopunc.split() if word.lower() not in stop_set]
    

#### Show the tokenization

In [11]:
# Apply the cleaner to the first five messages only, as a quick sanity check.
df['text'].head(5).apply(process_text)

0    [Subject, enron, methanol, meter, 988291, foll...
1    [Subject, hpl, nom, january, 9, 2001, see, att...
2    [Subject, neon, retreat, ho, ho, ho, around, w...
3    [Subject, photoshop, windows, office, cheap, m...
4    [Subject, indian, springs, deal, book, teco, p...
Name: text, dtype: object

## Example

In [12]:
message_1 = 'hello world hello hello world Nachi'
message_2 = 'test test test test one hello'
print(message_1)
print()

# Convert text to a matrix of token counts.
# Each document is passed as a plain string: CountVectorizer expects an iterable
# of documents, and wrapping every message in its own list only worked because
# process_text happened to iterate over the one-element list.
from sklearn.feature_extraction.text import CountVectorizer
eg = CountVectorizer(analyzer = process_text).fit_transform([message_1, message_2])
print(eg)
eg.shape

hello world hello hello world Nachi

  (0, 1)	3
  (0, 4)	2
  (0, 0)	1
  (1, 1)	1
  (1, 3)	4
  (1, 2)	1


(2, 5)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# Keep a handle on the fitted vectorizer: its vocabulary is needed to transform
# any new e-mail into the same feature space the classifier is trained on.
vectorizer = CountVectorizer(analyzer = process_text)
message_9 = vectorizer.fit_transform(df['text'])
# NOTE(review): the vocabulary is learned from every row before the train/test
# split, so test e-mails influence the feature set; fitting on the training
# text only would avoid that.

### Split the data into 80% training and 20% testing

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    message_9,
    df['label_num'],
    test_size=0.2,
    random_state=0,
)

In [15]:
# (documents, distinct tokens) of the full count matrix.
message_9.shape

(5171, 50381)

In [16]:
# Number of training labels.
y_train.shape


(4136,)

### Create the classifier

In [17]:
from sklearn.naive_bayes import MultinomialNB

# Build the multinomial Naive Bayes model, then fit it on the training counts.
# fit() returns the estimator itself; the trailing ';' keeps the cell silent.
clf = MultinomialNB()
clf.fit(x_train, y_train);

In [18]:
# Predicted labels for the training rows on the first line,
# the actual labels on the second.
print(clf.predict(x_train), y_train.values, sep='\n')

[0 0 0 ... 1 0 0]
[0 0 0 ... 1 0 0]


### Evaluate the model

In [19]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the classifier on the same rows it was fitted on.
pred = clf.predict(x_train)
# end='\n\n' produces the blank separator line a bare print() would emit.
print(classification_report(y_train, pred), end='\n\n')
print('confusion matrix : ', confusion_matrix(y_train, pred), end='\n\n')
print('accuracy :', accuracy_score(y_train, pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2940
           1       0.98      0.97      0.98      1196

    accuracy                           0.99      4136
   macro avg       0.99      0.98      0.98      4136
weighted avg       0.99      0.99      0.99      4136


confusion matrix :  [[2918   22]
 [  30 1166]]

accuracy : 0.9874274661508704


In [20]:
# Score the classifier on the held-out test split.
pred = clf.predict(x_test)
# end='\n\n' produces the blank separator line a bare print() would emit.
print(classification_report(y_test, pred), end='\n\n')
print('confusion matrix : ', confusion_matrix(y_test, pred), end='\n\n')
print('accuracy :', accuracy_score(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       732
           1       0.95      0.96      0.96       303

    accuracy                           0.97      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.97      0.97      0.97      1035


confusion matrix :  [[718  14]
 [ 13 290]]

accuracy : 0.9739130434782609
