In [1]:
import tkinter.ttk
import matplotlib.pyplot as plt
import numpy as np
from tkinter import *
from tkinter.filedialog import askopenfile
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

In [2]:
def load_models(models_dir='models'):
    """Load the six pickled classifiers used by the spam detector.

    Parameters
    ----------
    models_dir : str, optional
        Directory that holds the ``<NAME>_model.pkl`` files. Defaults to
        ``'models'``, resolved against the current working directory.

    Returns
    -------
    tuple
        The unpickled objects in the fixed order
        ``(LRC, SVC, DTC, KNN, RFC, NBC)``.

    Raises
    ------
    FileNotFoundError
        If one of the expected pickle files does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code stored in the file.
    # Only point this at model files you produced or otherwise trust.
    loaded = []
    for name in ('LRC', 'SVC', 'DTC', 'KNN', 'RFC', 'NBC'):
        # A forward slash is accepted by open() on Windows and POSIX alike,
        # whereas a hard-coded backslash only resolves on Windows.
        path = '{}/{}_model.pkl'.format(models_dir, name)
        # The context manager closes each handle as soon as it is unpickled.
        with open(path, 'rb') as file:
            loaded.append(pickle.load(file))
    return tuple(loaded)
# Unpickle every classifier once at start-up; the names are module-level globals.
LRC, SVC, DTC, KNN, RFC, NBC = load_models()

In [3]:
def preprocess(email):
    """Normalise raw email text to lowercase words separated by single spaces.

    Parameters
    ----------
    email : str
        Raw email text.

    Returns
    -------
    str
        The text with a leading ``Subject: `` prefix removed, every character
        outside ``a-z``/``A-Z`` replaced by a space, surrounding whitespace
        trimmed, whitespace runs collapsed to one space, and all letters
        lowercased. An input without any ASCII letters yields ``''``.
    """
    # Raw strings throughout: "\s" in a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    email = re.sub(r"^Subject: ", "", email)   # case-sensitive, start of text only
    # Digits, punctuation, newlines and non-ASCII letters all become spaces.
    # Single letters left over from contractions ("book's" -> "book s") are kept.
    email = re.sub(r"[^a-zA-Z]", " ", email)
    email = re.sub(r"^\s+", "", email)         # remove leading whitespace
    email = re.sub(r"\s+$", "", email)         # remove trailing whitespace
    email = re.sub(r"\s+", " ", email)         # collapse runs between words
    return email.lower()
def tokenize(email):
    """Split ``email`` into tokens using ``nltk.word_tokenize``.

    Parameters
    ----------
    email : str
        Text to tokenize.

    Returns
    -------
    Whatever ``nltk.word_tokenize`` returns for ``email``.
    """
    return nltk.word_tokenize(email)
def remove_stopwords(email):
    """Remove English stop words from ``email``.

    Parameters
    ----------
    email : str
        Text to filter; it is split with ``tokenize``.

    Returns
    -------
    str
        The tokens that are not in NLTK's English stop-word list, joined
        with single spaces (``''`` when nothing is left).
    """
    list_of_words = tokenize(email)
    if not list_of_words:
        # Nothing to filter, so the stop-word corpus is not touched at all.
        return ''

    # Fetch the stop-word list once instead of once per token, and keep it in
    # a set so each membership test is a hash lookup rather than a list scan.
    english_stopwords = set(stopwords.words('english'))

    # Join the surviving tokens back into one string separated by ' '.
    return ' '.join(word for word in list_of_words if word not in english_stopwords)
def lemmatize_email(email):
    """Lemmatize each token of ``email`` and rejoin the results.

    Parameters
    ----------
    email : str
        Text to lemmatize; it is split with ``tokenize``.

    Returns
    -------
    str
        The output of ``WordNetLemmatizer().lemmatize`` for every token
        (called without a part-of-speech argument), joined with single spaces.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join([wordnet.lemmatize(token) for token in tokenize(email)])

In [4]:
def preprocess_email(email):
    """Convert raw email text into the feature array passed to the classifiers.

    The text is run through ``preprocess``, ``remove_stopwords`` and
    ``lemmatize_email`` (in that order) and then transformed by the
    vectorizer unpickled from ``vectorizer.pkl`` in the working directory.

    Parameters
    ----------
    email : str
        Raw email text.

    Returns
    -------
    The result of ``.toarray()`` on the vectorizer's output for the single
    cleaned document (presumably one row of TF-IDF weights; depends on the
    pickled vectorizer).

    Raises
    ------
    FileNotFoundError
        If ``vectorizer.pkl`` is missing.
    """
    cleaned = preprocess(email)
    cleaned = remove_stopwords(cleaned)
    cleaned = lemmatize_email(cleaned)

    # SECURITY: pickle.load can execute arbitrary code stored in the file;
    # only use a vectorizer.pkl you created or trust.
    # NOTE: the file is re-read on every call; the context manager at least
    # guarantees the handle is closed each time.
    with open('vectorizer.pkl', 'rb') as file:
        vectorizer = pickle.load(file)

    # Transform only; the vectorizer is not re-fitted here.
    tfidf_matrix = vectorizer.transform([cleaned])
    return tfidf_matrix.toarray()

In [5]:
def get_label(encoded_label):
    """Map a two-element encoded prediction to ``'spam'`` or ``'ham'``.

    Parameters
    ----------
    encoded_label : sequence of sequences
        Only the first row is inspected.

    Returns
    -------
    str or None
        ``'spam'`` when the first row starts with ``0, 1``, ``'ham'`` when it
        starts with ``1, 0``, and ``None`` for any other combination.
    """
    first_row = encoded_label[0]
    if first_row[0] == 0 and first_row[1] == 1:
        return 'spam'
    if first_row[0] == 1 and first_row[1] == 0:
        return 'ham'
    return None

In [9]:
# --- Main window -----------------------------------------------------------
root = Tk()
root.config(background='gray')
root.title('Email Spam Detection')
root.geometry('800x500+250+100')

# Single frame that holds every widget of the form.
main_fw = Frame(root, bg='gray')

# --- Model selector --------------------------------------------------------
model_lbl = Label(main_fw, width=10, height=1, bg='gray', pady=5, text='Model', font=30)
model_lbl.grid(column=1, row=1)
model_box = tkinter.ttk.Combobox(main_fw, width=27, height=30)
model_box['values'] = ('Auto', 'Logistic Regression', 'SVM', 'Decision Tree',
                       'KNN', 'Random Forest', 'Naive Bayes')
model_box.grid(column=1, row=2)
# current() without an argument only *returns* the selected index; passing 0
# actually pre-selects 'Auto' so the form works without touching the box.
model_box.current(0)

# --- Email input -----------------------------------------------------------
email_lbl = Label(main_fw, width=10, height=1, bg='gray', pady=5, text='Email', font=30)
email_lbl.grid(column=1, row=3)
email_box = Text(main_fw, width=60, height=10)
email_box.grid(column=1, row=4, pady=5)
def detect_email():
    """Classify the text in ``email_box`` and display the verdict.

    Callback without parameters or return value. It reads the email text and
    the selected model name from the module-level widgets, runs the text
    through ``preprocess_email`` and the chosen classifier, and shows the
    outcome in a label at row 11 of ``main_fw``: green for ``'ham'``, red for
    ``'spam'``. When the model name is not recognised, or ``get_label``
    returns ``None``, the label stays black and empty.
    """
    # Remove the labels left behind by a previous click so that widgets do
    # not pile up on top of each other in the same grid cells.
    for stale in main_fw.grid_slaves(row=10, column=1) + main_fw.grid_slaves(row=11, column=1):
        stale.destroy()

    result_lbl = Label(main_fw, width=10, height=1, bg='gray', pady=5, text='Result', font=30)
    result_lbl.grid(column=1, row=10)
    # Starts out black and without text; recoloured below once a verdict exists.
    value_lbl = Label(main_fw, width=10, height=1, bg='black', fg='white', pady=5, font=30)
    value_lbl.grid(column=1, row=11)

    # "end-1c" drops the newline the Text widget always appends.
    email = email_box.get('1.0', 'end-1c')
    preprocessed_email = preprocess_email(email)

    # Display name -> classifier; 'Auto' is an alias for the SVM model.
    classifiers = {
        'Auto': SVC,
        'SVM': SVC,
        'Logistic Regression': LRC,
        'Decision Tree': DTC,
        'KNN': KNN,
        'Random Forest': RFC,
        'Naive Bayes': NBC,
    }
    label = None
    classifier = classifiers.get(model_box.get())
    if classifier is not None:
        label = get_label(classifier.predict(preprocessed_email))

    colours = {'ham': 'green', 'spam': 'red'}
    if label in colours:
        value_lbl.config(text=label, bg=colours[label])

# "Predict" button: clicking it runs detect_email.
predict_btn = Button(
    main_fw,
    text='Predict',
    width=30,
    height=2,
    command=detect_email,
    bg='black',
    fg='white',
    font=30,
    pady=5,
)
predict_btn.grid(column=1, row=8, pady=20)

# Place the frame in the window and hand control to Tk's event loop.
main_fw.pack()
root.mainloop()

