In [1]:
from statistics import mode
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw mail dataset from disk.
SPAM_CSV_PATH = 'E:\\Spam_spam_detector\\spam.csv'
raw_mail_data = pd.read_csv(SPAM_CSV_PATH, encoding='ISO-8859-1')

# Keep every non-null cell as it is and substitute an empty string for each missing one.
mail_data = raw_mail_data.where(raw_mail_data.notna(), '')


In [3]:

# Encode the text labels in place as integers: 'spam' becomes 0 and 'ham' becomes 1.
for label_text, label_code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label_text, 'Category'] = label_code


In [4]:

# Inputs are the message texts; targets are the encoded category labels.
X, Y = mail_data['Message'], mail_data['Category']

# Hold out 15% of the rows for testing (85/15 split) with a fixed random_state.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=2,
)

# Cast the 0/1 category labels to an integer dtype.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

# Turn the message texts into TF-IDF feature matrices.
tfidf_vectorizer = TfidfVectorizer(lowercase=True)
X_train_features = tfidf_vectorizer.fit_transform(X_train)  # fit on the training messages, then transform them
X_test_features = tfidf_vectorizer.transform(X_test)  # transform only: the test data must not be used for fitting


In [5]:

# Fit a logistic-regression classifier (default settings) on the TF-IDF training features.
model = LogisticRegression()
model.fit(X=X_train_features, y=Y_train)

In [6]:
# Evaluate the fitted model on both splits: predict first, then score against the true labels.
prediction_on_train_data = model.predict(X_train_features)
prediction_on_test_data = model.predict(X_test_features)

accuracy_on_train_data = accuracy_score(Y_train, prediction_on_train_data)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

# The scores are multiplied by 100 so they are reported as percentages.
print('Accuracy on train data: ', accuracy_on_train_data * 100, '%')
print('Accuracy on test data: ', accuracy_on_test_data * 100, '%')


Accuracy on train data:  97.80405405405406 %
Accuracy on test data:  96.5311004784689 %


In [7]:

# Classify new messages with the already-fitted vectorizer and model.
input_mail = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
    "Even my brother is not like to speak with me. They treat me like aids patent.",
]

# Use transform (not fit_transform) so the fitted vectorizer is reused as-is for the new texts.
input_mail_features = tfidf_vectorizer.transform(input_mail)
prediction = model.predict(input_mail_features)

# Pair each message with its prediction instead of looping over a hard-coded count,
# so adding or removing messages above can neither skip a result nor raise an IndexError.
# Label encoding used in this notebook: 0 = spam, 1 = ham.
for mail_text, predicted_label in zip(input_mail, prediction):
    if predicted_label == 0:
        print('"', mail_text, '" is SPAM MAIL.')
    else:
        print('"', mail_text, '" is NON-SPAM MAIL.')


" Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's " is SPAM MAIL.
" Even my brother is not like to speak with me. They treat me like aids patent. " is NON-SPAM MAIL.
