Spam Mail Detection

In [1]:
# importing the necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# loading the dataset; the forward slash keeps this relative path valid on
# Windows, Linux and macOS (a backslash is only a path separator on Windows)
mail_data = pd.read_csv("Data_set/mail_data.csv")

In [3]:
# counting the missing values in each column (nothing is replaced here)
mail_data.isna().sum()

Category    0
Message     0
dtype: int64

In [4]:
# preview the first five rows of the dataset
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# number of rows and columns, shown as a (rows, columns) tuple
mail_data.shape

(5572, 2)

In [6]:
mail_data["Category"].value_counts()

# there are two categories (ham and spam), so we convert these text labels into numbers by label encoding

Category
ham     4825
spam     747
Name: count, dtype: int64

ham -- 0
spam -- 1

In [7]:
# encoder used below to turn the Category text labels into integer codes
encoder = LabelEncoder()

In [8]:
# replace the text labels with integer codes (ham -> 0, spam -> 1).
# This overwrites the Category column, so re-running this cell re-fits the
# encoder on the already-encoded values instead of the original text labels.
mail_data["Category"] = encoder.fit_transform(mail_data["Category"])

In [9]:
# class counts after encoding, to compare with the ham/spam counts seen before encoding
mail_data["Category"].value_counts()

Category
0    4825
1     747
Name: count, dtype: int64

In [10]:
# features: the raw message text
X = mail_data["Message"]

In [11]:
# target: the encoded category (0 = ham, 1 = spam)
Y = mail_data["Category"]

In [12]:
# quick look at the message texts (pandas shortens the printed Series)
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [13]:
# quick look at the encoded labels (pandas shortens the printed Series)
print(Y)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int64


In [14]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the classes are imbalanced (4825 ham vs 747 spam) - consider stratify=Y.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [15]:
# sizes of the full, training and test message sets, one per line
print(X.shape, X_train.shape, X_test.shape, sep="\n")

(5572,)
(3900,)
(1672,)


In [16]:
# sizes of the full, training and test label sets, one per line
print(Y.shape, Y_train.shape, Y_test.shape, sep="\n")

(5572,)
(3900,)
(1672,)


In [17]:
# The labels were already integer-encoded by the LabelEncoder earlier in the notebook,
# so this cast only makes sure the dtype is int before training.
Y_train = Y_train.astype(int)
Y_test = Y_test.astype(int)

# Feature extraction: TfidfVectorizer converts the raw messages into a matrix of TF-IDF features.
vectorizer = TfidfVectorizer(
    min_df=1,
    stop_words="english",
    lowercase=True,
)

# Learn the vocabulary and IDF weights from the training messages only, then apply
# that same fitted vocabulary to the test messages.
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

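- min_df = 1 --> keep every word that appears in at least 1 document (words whose document frequency is below min_df are ignored)
- stop_words -->  exclude common English words like is, was, are ...
- lowercase -->   convert the text to lowercase before tokenizing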


In [18]:
# (documents, vocabulary size) of the training and test TF-IDF matrices, one per line
print(X_train_features.shape, X_test_features.shape, sep="\n")

(3900, 7003)
(1672, 7003)


In [19]:
# creating the logistic regression model (no arguments, so library defaults are used)

model = LogisticRegression()

In [20]:
# training the model on the TF-IDF features and labels of the training data

model.fit(X_train_features ,Y_train)

Evaluating the model

In [21]:
# predicting on the training data and scoring those predictions against the true training labels
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [22]:
print(accuracy_on_training_data)
# so we got an accuracy of about 96% on the training data

0.9628205128205128


In [23]:
# predicting on the held-out test data and scoring those predictions against the true test labels
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test,prediction_on_test_data)


In [24]:
print(accuracy_on_test_data)

# so we got an accuracy of about 96.6% on the test data as well

0.9659090909090909


Why did we compute the accuracy on both the training and the test data? To check for overfitting: an overfitted model scores very high on the training data but much lower on unseen test data. Here the two scores are almost the same (about 96% each), so the model does not appear to be overfitting.

Building a predictive model for custom inputs

In [None]:
def predict(input_mail=None):
    """Classify one email as spam or ham and print the result.

    Parameters
    ----------
    input_mail : str, optional
        Text of the email to classify. When omitted (the default), the text
        is read interactively with ``input()``, exactly as before. Passing
        the text explicitly makes the call reproducible on a re-run.

    Returns
    -------
    None
        The raw predicted label array and a readable verdict are printed.

    Notes
    -----
    Uses the fitted ``vectorizer`` and ``model`` objects created earlier in
    the notebook, so those cells must have been run first.
    """
    if input_mail is None:
        input_mail = input("Enter the email content: ")

    # transform() expects a collection of documents, hence the one-item list.
    input_data = vectorizer.transform([input_mail])
    prediction = model.predict(input_data)
    print(prediction)

    # predict() returns one label per document; pick the single label
    # explicitly instead of using the whole array as the if-condition.
    if prediction[0] == 1:
        print("Result: ❌ Spam")
    else:
        print("Result: ✅ Ham (Not Spam)")


 - `0 -- ham`
 - ` 1 -- spam`

In [30]:
# run the predictor: it prompts for the email text and prints the predicted label
predict()

[1]
Result: ❌ Spam
