In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



In [2]:
# Load the mail dataset from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='dataset/mail_data.csv')

In [3]:
# Count the missing values in each column (displayed as the cell output).
data.isna().sum()

Category    0
Message     0
dtype: int64

In [4]:
# Keep every non-null cell as is and substitute an empty string for any
# missing one, producing a new frame for the rest of the notebook.
email_data = data.where(data.notna(), other='')

In [5]:
# Preview the first five rows of the cleaned frame.
email_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Encode the labels numerically: ham -> 1, spam -> 0.
#
# A new frame is assigned instead of mutating `email_data` in place, and the
# integer dtype is requested explicitly rather than relying on `replace`
# silently downcasting the object column (that downcasting is deprecated as of
# pandas 2.2). Values that are not keys of the mapping -- e.g. labels that were
# already encoded by a previous run of this cell -- are passed through
# unchanged, so re-running the cell is harmless. Any value that cannot be
# converted to an integer makes `astype` raise instead of going unnoticed.
label_codes = {'ham': 1, 'spam': 0}
email_data = email_data.assign(
    Category=email_data['Category']
    .map(lambda label: label_codes.get(label, label))
    .astype('int')
)

In [8]:
# Preview the first five rows again to check the Category column.
email_data.head(n=5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Separate the message text (model input) from the Category column (target).
x, y = email_data['Message'], email_data['Category']

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=16,
)

In [12]:
# Build the TF-IDF vectorizer that turns message text into numeric features.
# English stop words are dropped and text is lowercased before tokenizing.
# `lowercase` must be a real boolean: the string 'True' is rejected by
# scikit-learn's parameter validation (version 1.2 and later) when fitting.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [14]:
# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectorizer to encode the test messages.
train_features = feature_extraction.fit_transform(x_train)
test_features = feature_extraction.transform(x_test)

# Cast both label series to integers.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [15]:
# Fit a logistic-regression classifier (default settings) on the training
# features; `fit` returns the estimator itself, which is then displayed.
model = LogisticRegression().fit(train_features, y_train)
model

LogisticRegression()

In [19]:
# Predict labels for both splits with the fitted model.
train_pred, test_pred = model.predict(train_features), model.predict(test_features)

# Compare the predictions against the true labels of each split.
train_acc, test_acc = accuracy_score(y_train, train_pred), accuracy_score(y_test, test_pred)

# Report both accuracies as percentages.
print('Training Accuracy: ', train_acc * 100)
print('Testing Accuracy: ', test_acc * 100)

Training Accuracy:  96.90374691496523
Testing Accuracy:  96.50224215246637


In [20]:
# Classify one hand-written message with the fitted vectorizer and model.
input_mail = ["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]

# The message must go through the same fitted vectorizer used for training.
input_data = feature_extraction.transform(input_mail)
prediction = model.predict(input_data)

# Label 1 is ham (not spam); anything else is reported as spam.
print('Email Is Not Spam' if prediction[0] == 1 else 'Email is Spam')

Email is Spam
