#Email Spam detection using Logistic Regression

In [1]:
#importing major libraries needed
import csv
import pandas as pd
import string
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

In [2]:
#reading the dataset using pandas
data = pd.read_csv('spam.csv',encoding='latin-1')

In [3]:
#data cleaning
data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)

data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#function to process the text messages and remove stopwords etc. from them
def text_processing(text):
    text = text.translate(str.maketrans('','',string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    return " ".join(text) 

In [5]:
data['text'] = data['text'].apply(text_processing)

In [6]:
data.head()

Unnamed: 0,label,text
0,ham,Go jurong point crazy Available bugis n great ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say early hor U c already say
4,ham,Nah dont think goes usf lives around though


In [7]:
#Converting words to vector using TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(data['text'])
features = vectors

In [8]:
#labelling 'ham' messages to 0 and 'spam' messages to 1
def categorize(category):
    if category == "ham":
      return 0
    else:
      return 1
data['label'] = data['label'].apply(categorize)

In [9]:
#creating train and test splits in dataset, making train dataset and test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, data['label'], test_size=0.15, random_state=111)

In [10]:
#creating a model object for Logistic Regression
lr = LogisticRegression(solver='liblinear', penalty='l1')

#training the model on training dataset
lr.fit(X_train,y_train)

#testing the model on test dataset, and making predictions on test dataset
predictions = lr.predict(X_test)

In [50]:
from sklearn.metrics import accuracy_score, confusion_matrix

#printing acuracy_score
print(accuracy_score(y_test,predictions))

0.9533492822966507


In [51]:
#printing confusion_matrix
print(confusion_matrix(y_test,predictions))

[[718   7]
 [ 32  79]]
