# Import the dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data collection and pre-processing

In [2]:
# read the raw spam/ham dataset from disk into a pandas DataFrame
raw_mail_data = pd.read_csv(
    'spam.csv',
    encoding='latin-1',  # presumably the file is not valid UTF-8 — confirm
)

In [3]:
# preview the first 5 rows of the raw data
raw_mail_data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# keep every non-null cell as is and put an empty string wherever a value is missing
has_value = raw_mail_data.notna()
mail_data = raw_mail_data.where(has_value, other='')

In [5]:
# show the first 5 rows after null replacement
mail_data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# Keep only the label column ('v1') and the message-text column ('v2').
#
# Selecting by name and rebinding the result (instead of dropping positions
# 2-4 in place) makes this cell safe to re-run: a positional in-place drop
# raises IndexError on a second execution because the frame then has only
# two columns left. The explicit copy keeps later .loc assignments from
# writing into a view of the wider frame.
mail_data = mail_data[['v1', 'v2']].copy()

In [7]:
# confirm only the label and text columns remain
mail_data.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# checking the number of rows and columns in the dataframe;
# .shape is a (n_rows, n_columns) tuple
mail_data.shape

(5572, 2)

# Label Encoding

In [9]:
# encode the text labels as numbers: spam -> 0, ham -> 1
# (applied in that order, one label at a time, writing back into column 'v1')
for label_text, label_code in (('spam', 0), ('ham', 1)):
    matches_label = mail_data['v1'] == label_text
    mail_data.loc[matches_label, 'v1'] = label_code

In [10]:
# features: message text ('v2'); target: encoded label ('v1')
X, Y = mail_data['v2'], mail_data['v1']

# Splitting the data into training data and testing data


In [11]:
# hold out 20% of the messages for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=3,
    test_size=0.2,
)

In [12]:
# cast both label splits to integers in one step
# (the 0/1 codes were written into a column that originally held text labels)
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

In [13]:
# sizes of the full feature series, then of its train and test partitions
for split in (X, X_train, X_test):
    print(split.shape)

(5572,)
(4457,)
(1115,)


# Feature Extraction and building the model

In [14]:
# Build one pipeline that turns raw message text into TF-IDF feature vectors
# and feeds them to a logistic-regression classifier, then fit it on the
# training split. Keeping both steps in a single pipeline means the same
# vectorizer vocabulary is reused automatically at prediction time.

from sklearn.pipeline import make_pipeline

model = LogisticRegression()

# `lowercase` must be a real boolean. The string 'True' only worked because it
# is truthy; newer scikit-learn releases validate parameter types and reject it.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

pipe = make_pipeline(feature_extraction, model)
pipe.fit(X_train, Y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(lowercase='True', stop_words='english')),
                ('logisticregression', LogisticRegression())])

In [15]:
# predicted labels (0 = spam, 1 = ham) for the held-out test messages
# NOTE(review): later cells visible here recompute predictions from `pipe` rather than reading y_pred
y_pred = pipe.predict(X_test)

In [16]:
# accuracy on testing data: fraction of held-out messages whose predicted label matches the true one
accuracy_score(Y_test, pipe.predict(X_test))

0.9623318385650225

# Building a Predictive System

In [17]:
# classify a single hand-written message with the fitted pipeline
input_mail = ["Free entry into kuner hotel coupon"]

prediction = pipe.predict(input_mail)
print(prediction)

# training labels were encoded as 1 = ham; any other value is reported as spam
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[1]
Ham mail


# Saving the Model

In [18]:
import joblib

# persist the fitted pipeline (vectorizer + classifier together) to disk
# NOTE: joblib files are pickle-based; only load them back from trusted sources
with open('Spam_mail_model.joblib', 'wb') as model_file:
    joblib.dump(pipe, model_file)