# Email Spam Classifier


Completed by: Khalid Rajan


This is an implementation of Logitic Regression to classify spam emails by looking at the email body text

In [7]:
import pandas as pd
import nltk
nltk.download('stopwords')
import re
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/khalidrajan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
data=pd.read_csv("spam_ham_dataset.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [9]:
#Extract the label and email text columns
data=data[['label', 'text']]
data.head()

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


## Extracting the Body Text from Emails

In [12]:
emails=data.loc[:, "text"]
#initialize empty array to hold the email body text
email_body=[]
#extract the body text and remove subject lines from the email
for email in emails:
    #seperate the subject line from the body text by splitting the email text after the first newline character
    email_lines=email.rsplit("\n")
    #convert one-element array to string and append to empty_body array
    email_body.append(''.join(email_lines[1:]))

In [13]:
#add the email_body column to the dataframe
data['email_body']=email_body
#remove the text column, as it is no longer needed
data=data[['label', 'email_body']]
data.head()

Unnamed: 0,label,email_body
0,ham,this is a follow up to the note i gave you on ...
1,ham,( see attached file : hplnol 09 . xls )\r- hpl...
2,ham,"ho ho ho , we ' re around to that most wonderf..."
3,spam,abasements darer prudently fortuitous undergon...
4,ham,this deal is to book the teco pvr revenue . it...


## Data Preprocessing

In [14]:
#To find the stem words in the email body text
stopwords=nltk.corpus.stopwords.words('english')
ps=nltk.PorterStemmer()


def clean_text(text):
    #remove punctuation
    text="".join([word.lower() for word in text if word not in string.punctuation])
    #split text into tokens
    tokens=re.split('\W+', text)
    #find the stem word and remove stopwords
    text=[ps.stem(word) for word in tokens if word not in stopwords]
    return text

## TF-IDF Vectorization

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect=TfidfVectorizer(analyzer=clean_text)
X_tfidf=tfidf_vect.fit_transform(data['email_body'])

## Feature Engineering

In [16]:
#count length of messgae
data['body_len']=data['email_body'].apply(lambda x: len(x)-x.count(" "))
X_features=pd.concat([data['body_len'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()

Unnamed: 0,body_len,0,1,2,3,4,5,6,7,8,...,41993,41994,41995,41996,41997,41998,41999,42000,42001,42002
0,225,0.066724,0.0,0.126065,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,44,0.095006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,314,0.032629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,242,0.061984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Train-Test-Split

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2, random_state=20) 

## Logistic Regression Model

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score

Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(X_train, y_train)
pred = Spam_model.predict(X_test)
accuracy_score(y_test,pred)

0.957487922705314