In [None]:
import numpy as np # comments in this guide will be added like this
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn import metrics
from sklearn import datasets
%matplotlib inline

# Importing the data set

In [None]:
# pd.read_csv is a pandas function that loads a CSV file into a DataFrame
df = pd.read_csv('../input/spam-filter-lm/mail_data.csv')

# preview only the first 5 rows instead of dumping the whole DataFrame
df.head()

In [None]:
# Keep every non-null cell as it is and put an empty string ('') wherever a value is missing.
# The result is a new DataFrame; df itself is left untouched.
mail_df = df.where(df.notna(), other='')

In [None]:
# preview the first 5 rows of the cleaned data set
mail_df.head(n=5)

In [None]:
# .shape is a (number_of_rows, number_of_columns) tuple for the data set
mail_df.shape

# Converting Ham and Spam to 1 and 0

In [None]:
# Encode the text labels as numbers: spam -> 0, ham -> 1.
# mail_df.loc[rows, 'Category'] selects the Category cells of the rows whose
# label matches and overwrites them in place with the numeric code.
for label_text, label_code in (('spam', 0), ('ham', 1)):
    mail_df.loc[mail_df['Category'] == label_text, 'Category'] = label_code

# Split the data into X and Y

In [None]:
# X holds the message texts (model input), Y holds the matching labels (model target)
X, Y = mail_df['Message'], mail_df['Category']

In [None]:
# show the message texts selected from the 'Message' column
print(X)

In [None]:
# show the labels selected from the 'Category' column
print(Y)

# Splitting the data into 70/30 split

In [None]:
# Split the data into a training set and a test set.
# test_size=0.3 keeps 30% of the rows for testing (so 70% are used for training).
# random_state=0 fixes the seed of the shuffle so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

**This shows the split of the data**

In [None]:
# print the size of the full data, then the training part, then the test part
for part in (X, X_train, X_test):
    print(part.shape)

**The next cell turns the data into feature vectors, removes stop words and changes everything to lowercase. Changing everything to lowercase makes sure that every occurrence of a word is treated equally.**

In [None]:
# Feature extraction - transforms the text data into numeric feature vectors that
# can be used by logistic regression. TfidfVectorizer looks at the training texts
# and gives every word a weight (TF-IDF score).
# min_df=1 keeps every word that appears in at least one message, so nothing is dropped for being rare
# stop_words='english' ignores common English words that carry little meaning (the, did, etc.)
# lowercase=True converts all text to lowercase before tokenizing. It must be a real
# boolean: the string 'True' only worked by being truthy and is rejected by the
# parameter validation of newer scikit-learn releases.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# learn the vocabulary and weights from the training messages only, then reuse
# that fitted vocabulary to transform the test messages
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert the Y_train and Y_test labels to integers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [None]:
print(X_train) # prints the training messages as raw text (before feature extraction)

In [None]:
print(X_train_features) # prints the feature values produced by fit_transform on the training messages

# Algorithm used: Logistic Regression

In [None]:

# create a logistic regression classifier with scikit-learn's default settings
model = LogisticRegression()

In [None]:
# Train the logistic regression model on the training feature vectors and their labels
model.fit(X_train_features, Y_train)

# **Accuracy of Algorithm**

In [None]:
# Evaluate the trained model on the data it was trained on:
# first predict a label for every training message ...
prediction_on_training_data = model.predict(X_train_features)
# ... then measure the fraction of predictions that match the true labels
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
# report the accuracy measured on the training data
print('accuracy on training data', accuracy_on_training_data)

In [None]:
# Evaluate on the held-out test data: predict a label for every test message,
# then measure the fraction of predictions that match the true labels
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

print('accuracy on test data', accuracy_on_test_data)

The accuracy on the test data is slightly lower than the accuracy on the training data.

# **Using confusion matrix to determine the accuracy of the algorithm**

In [None]:
# `metrics` is already imported in the first cell, so it is not imported again here.
# Rows of the matrix are the true labels and columns are the predicted labels,
# both in sorted label order: index 0 = spam, index 1 = ham.
cnf_matrix = metrics.confusion_matrix(Y_test, prediction_on_test_data)
cnf_matrix

In [None]:
# Print accuracy, precision and recall on the test data.
# NOTE: precision_score and recall_score use pos_label=1 by default, and label 1
# is ham in this notebook, so precision and recall here describe ham detection.
for metric_label, metric_function in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_function(Y_test, prediction_on_test_data))

# F1 Score - 0.97

**The F1 score was used because it is the harmonic mean of precision and recall, so it takes both false positives (FP) and false negatives (FN) into account. It is most useful when the data set is imbalanced.**

In [None]:
from sklearn.metrics import f1_score

# F1 combines precision and recall into one number; like them it is computed
# for label 1 (ham) by default
print('F1 is: ', f1_score(y_true=Y_test, y_pred=prediction_on_test_data))

# Predictive system

**Any of the emails from the dataset can be placed in the input and it will predict whether or not the email is spam or ham**

In [None]:
# Building a predictive system: classify one hand-written example message
input_mail = ["Ahhh. Work. I vaguely remember that! What does it feel like? Lol"]

# convert the text to a feature vector with the vectorizer fitted on the training data
input_mail_features = feature_extraction.transform(input_mail)

# predict returns one label per input message
prediction = model.predict(input_mail_features)
print(prediction)

# look at the first (and only) prediction: label 1 means ham, anything else means spam
print('Not Spam' if prediction[0] == 1 else 'Spam')