# Importing the Dependencies

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data collection and preprocessing

In [3]:
# Read the mail dataset from 'mail_data.csv' (path is relative to the working
# directory) into a DataFrame, then preview its first rows as the cell output.
row_mail_data = pd.read_csv('mail_data.csv') 
row_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Replace null values with an empty string

In [4]:
# Keep every non-null cell as is; substitute '' wherever a value is missing.
# This produces a new frame, so the loaded `row_mail_data` stays untouched.
mail_data = row_mail_data.where(row_mail_data.notna(), other='')

In [6]:
# Preview the first rows of the frame after null replacement.
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Dimensions of the frame as (rows, columns).
mail_data.shape

(5572, 2)

# Label Encoding

0 --> spam

1 --> ham (not spam)

In [13]:
# Encode the text labels in place: spam -> 0, ham -> 1.
# Rows whose Category matches neither label are left unchanged.
for label_name, label_code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label_name, 'Category'] = label_code

In [14]:
# Split the frame into the model input (message text) and the target (label).
X, Y = mail_data['Message'], mail_data['Category']

In [15]:
# Show the target column to check the values after label encoding.
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [18]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [19]:
# Sanity check: sizes of the full, training, and test message sets.
print(X.shape,X_train.shape,X_test.shape)

(5572,) (4457,) (1115,)


# Feature Extraction

In [20]:
# Turn the raw message text into TF-IDF feature vectors.
# `lowercase` must be a real boolean: the string 'True' only worked because it
# is truthy, and scikit-learn versions that validate estimator parameters
# reject it at fit time.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectorizer on the test messages so no test information
# leaks into the features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The encoded labels are held in an object-dtype Series; cast them to integers
# before handing them to the classifier.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [33]:
#print(X_train)
#print(X_train_features)

# Training the Model: Logistic Regression

In [23]:
# Create a logistic-regression classifier with default settings and fit it on
# the TF-IDF training features; the fitted estimator is shown as cell output.
model = LogisticRegression()
model.fit(X=X_train_features, y=Y_train)

LogisticRegression()

In [26]:
# Predict on the same data the model was trained on and score it, giving the
# training accuracy.
predict_training_data = model.predict(X_train_features)
accuracy_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=predict_training_data,
)

In [30]:
# Report the accuracy measured on the training set.
print('Accuracy of traning data :',accuracy_training_data)

Accuracy of traning data : 0.9670181736594121


In [31]:
# Predict on the held-out test features and score against the true labels,
# giving the test accuracy.
predict_testing_data = model.predict(X_test_features)
accuracy_testing_data = accuracy_score(
    y_true=Y_test,
    y_pred=predict_testing_data,
)

In [32]:
# Report the accuracy measured on the held-out test set.
print('Accuracy of testing data :',accuracy_testing_data)

Accuracy of testing data : 0.9659192825112107


# Building a prediction system

In [38]:
# A single example message, wrapped in a list because the vectorizer expects
# an iterable of documents.
input_mail = [
    "Did I forget to tell you ? I want you , I need you, I crave you ... But most of all ... I love you my sweet Arabian steed ... Mmmmmm ... Yummy",
]

# Vectorize the message with the already-fitted TF-IDF transformer.
input_data_features = feature_extraction.transform(input_mail)

# Classify the message and print the predicted label array.
prediction = model.predict(input_data_features)
print(prediction)

[1]


In [41]:
# Translate the numeric prediction into a readable verdict (1 = ham, else spam).
print(
    'This one is to be Ham Mail'
    if prediction[0] == 1
    else 'This one is to be Spam Mail'
)

This one is to be Ham Mail
