Importing The Dependencies

In [69]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Pre-Processing

In [70]:
# Read the labelled messages from the CSV file into a pandas DataFrame.
raw_mail_data = pd.read_csv(filepath_or_buffer='mail_data.csv')

Print DataSet

In [71]:
 # Show the raw frame as plain text; for a frame this long pandas prints only
 # the first and last rows and elides the middle.
 print(raw_mail_data)

      Category                                            Message
0          ham  Go until jurong point, crazy.. Available only ...
1          ham                      Ok lar... Joking wif u oni...
2         spam  Free entry in 2 a wkly comp to win FA Cup fina...
3          ham  U dun say so early hor... U c already then say...
4          ham  Nah I don't think he goes to usf, he lives aro...
...        ...                                                ...
10738      ham  put the 10 on the ft\r\nthe transport volumes ...
10739      ham  3 / 4 / 2000 and following noms\r\nhpl can ' t...
10740      ham  calpine daily gas nomination\r\n>\r\n>\r\njuli...
10741      ham  industrial worksheets for august 2000 activity...
10742     spam  important online banking alert\r\ndear valued ...

[10743 rows x 2 columns]


In [72]:
# Keep every non-null cell as it is and put an empty string in place of each
# missing value, so downstream text handling never sees NaN.
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

In [73]:
# Preview the top five rows of the cleaned frame.
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [74]:
# Report the frame's dimensions as a (rows, columns) tuple.
mail_data.shape

(10743, 2)

Label Encoding

In [75]:
# Encode the target column in place: spam -> 0, ham -> 1.
# Rows whose Category is neither value are left untouched.
spam_rows = mail_data['Category'] == 'spam'
ham_rows = mail_data['Category'] == 'ham'

mail_data.loc[spam_rows, 'Category'] = 0
mail_data.loc[ham_rows, 'Category'] = 1

spam  -  0 , ham  -  1

In [76]:
# Split the frame into the model input (message text) and the target (label).
X, Y = mail_data['Message'], mail_data['Category']

Print Message

In [77]:
#n print(X)

Print Category

In [78]:
#n print(Y)

Splitting the Dataset

In [79]:
# Hold out 20% of the messages for testing. The fixed random_state keeps the
# shuffle, and therefore the split, the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [80]:
# Shapes of the full message set, the training part and the test part.
for messages in (X, X_train, X_test):
    print(messages.shape)

(10743,)
(8594,)
(2149,)


Feature Extraction

In [81]:
# Turn the message text into TF-IDF feature vectors for the classifier.
# The vectorizer is fitted on the training messages only and is then reused,
# unchanged, to transform the test messages.
feature_extraction = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast both label series to an integer dtype.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [82]:
#n print(X_train)

In [83]:
#n print(X_train_features)

Training the Model

Logistic Regression.

In [84]:
# Instantiate the classifier; no arguments are passed, so every setting is
# left at the library default.
model = LogisticRegression()

In [85]:
# Fit the classifier on the TF-IDF vectors and integer labels of the training split.
model.fit(X=X_train_features, y=Y_train)

Evaluating the trained model

In [86]:
# Score the fitted model on the same messages it was trained on.
prediction_on_training_data = model.predict(X=X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

Training Accuracy

In [87]:
# Training accuracy, scaled from a fraction to a percentage.
print('Accuracy : ', 100 * accuracy_on_training_data)

Accuracy :  95.50849429834769


In [88]:
# Score the fitted model on the held-out test messages.
prediction_on_test_data = model.predict(X=X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

Test Accuracy

In [89]:
# Test accuracy, scaled from a fraction to a percentage.
print('Accuracy : ', 100 * accuracy_on_test_data)

Accuracy :  92.60120986505352


Building a Predictive System.

In [90]:
# input_mail = [input("Enter E-mail")]
# input_mail = [" Input E-Mail"]

# # convert text to feature vectors
# input_data_features = feature_extraction.transform(input_mail)

# # making prediction

# prediction = model.predict(input_data_features)
# print(prediction)


# if (prediction[0]==1):
#   print('Ham mail')

# else:
#   print('Spam mail')

Display Output in a Separate UI

In [91]:
import tkinter as tk
from tkinter import Text, Label, Button, messagebox

# Load your pre-trained model and feature extraction if needed
# model = joblib.load("your_model_file.pkl")
# feature_extraction = joblib.load("your_tfidf_vectorizer.pkl")

def classify_email():
    """Classify the text in the email box and show the verdict in the result label.

    Reads the contents of the module-level ``email_entry`` widget. Input with
    fewer than five whitespace-separated words (which includes empty input)
    is rejected with an error dialog; any verdict left over from an earlier
    run is cleared first so it cannot be mistaken for the verdict on the
    rejected text. Otherwise the text is vectorised with the module-level
    ``feature_extraction`` and passed to ``model``; a prediction of 1 is
    shown as ham, anything else as spam.
    """
    min_words = 5

    # "end-1c" stops one character before the end of the widget's contents,
    # which leaves out the newline Tk keeps after the last line.
    email_text = email_entry.get("1.0", "end-1c").strip()

    # An empty string splits to an empty list, so blank input is rejected too.
    if len(email_text.split()) < min_words:
        # Drop the previous verdict before the dialog opens.
        result_label.config(text='')
        messagebox.showerror("Error", "Please enter a valid email ", icon='error')
        return

    # The vectorizer expects an iterable of documents, hence the one-item list.
    input_data_features = feature_extraction.transform([email_text])
    prediction = model.predict(input_data_features)

    if prediction[0] == 1:
        result_label.config(text='Ham mail', fg='green')
    else:
        result_label.config(text='Spam mail', fg='red')

# ---- Main window ---------------------------------------------------------
window = tk.Tk()
window.title("Spam Email Detector")
window.configure(bg='black')
window.geometry("400x350")  # initial window size, width x height

# White-on-black colours shared by every label.
label_colours = {'bg': 'black', 'fg': 'white'}

# ---- Widgets, created in the order they appear on screen ------------------
title_label = Label(window, text="Spam Email Detector", font=('Helvetica', 16), **label_colours)
email_label = Label(window, text="Enter Email:", **label_colours)
email_entry = Text(window, height=10, width=40)
classify_button = Button(
    window,
    text="Classify Email",
    command=classify_email,
    padx=10,
    pady=5,
    bg='#4CAF50',
    fg='white',
)
# Starts empty; classify_email writes the verdict into it.
result_label = Label(window, text='', font=('Helvetica', 14), **label_colours)

# ---- Layout: stacked top to bottom ----------------------------------------
title_label.pack()
email_label.pack()
# The text box is the only widget that grows when the window is resized.
email_entry.pack(fill='both', expand=True, padx=10, pady=10)
classify_button.pack(pady=10)
result_label.pack(pady=10)

# Hand control to tkinter; this call blocks until the window is closed.
window.mainloop()