# Email Spam Detection by HARSHIT MELANI
    
 The objective of this project is to create an email spam detection system that uses machine learning algorithms to classify incoming emails. By training the model on a labeled dataset of spam and non-spam emails, we aim to develop an accurate and efficient spam detector that can reliably identify and categorize emails based on their content and characteristics.

In [1]:
# Developed an email spam detection system using machine learning algorithms. The goal was to create an efficient system that can accurately classify incoming emails as either spam or non-spam (ham).

# Load the required libraries.


In [2]:
import pandas as pd
import nltk
import numpy as np


In [3]:
import chardet

In [4]:
# Load the dataset 'spam.csv' with pandas, decoding the file as Latin-1.
# The codec is given by its canonical name 'latin-1': a name containing stray
# spaces or parentheses would depend on Python's lenient codec-name
# normalization to be resolved at all.
encoding = 'latin-1'

df = pd.read_csv('spam.csv', encoding=encoding, sep=',')
    
  
    


In [5]:
# Print the loaded DataFrame to inspect its columns and a sample of rows.
print(df)

        v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  
1           NaN        NaN  


In [6]:
# Discard the three 'Unnamed' columns so that only the label column (v1) and
# the message text column (v2) remain, then print the trimmed frame.
df1 = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis='columns')
print(df1)

        v1                                                 v2
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will Ì_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [7]:
# Summarize the trimmed DataFrame: column dtypes and non-null counts.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
# Show the DataFrame dimensions as (rows, columns).
df1.shape

(5572, 2)

In [9]:
# Encode the labels in column 'v1' in place: 'ham' -> 0 and 'spam' -> 1.
# The column keeps dtype object after these assignments; the labels are cast
# to int later, just before the model is fitted.
df1.loc[df1['v1'] == 'ham', 'v1'] = 0
df1.loc[df1['v1'] == 'spam', 'v1'] = 1

# X holds the message text, Y the numeric label.
X, Y = df1['v2'], df1['v1']

In [10]:
# Print the label Series to confirm the ham/spam -> 0/1 mapping.
print(Y)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: v1, Length: 5572, dtype: object


In [11]:
# Import the scikit-learn tools used below: train/test splitting, TF-IDF (Term Frequency-Inverse Document Frequency) vectorization to convert text data into numerical features, logistic regression, and accuracy scoring.

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [12]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# TF-IDF vectorizer: keep every term that appears in at least one document,
# drop English stop words, and lowercase the text before tokenizing.
# `lowercase` must be a real boolean; the string 'True' is rejected by the
# parameter validation in recent scikit-learn releases when fitting.
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [14]:
# Learn the vocabulary and IDF weights from the training text only, then
# apply that same fitted transformation to the test text.
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# The labels are stored with dtype object; cast them to integers for fitting
# and scoring.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [15]:
# Use logistic regression as the classifier; no arguments are passed, so
# scikit-learn's default settings apply.
model = LogisticRegression()

In [16]:
# Fit the model on the TF-IDF training features and the integer labels.
# Accuracy on the training and test sets is computed in the cells that follow.
model.fit(X_train_features,Y_train)

LogisticRegression()

In [17]:
# Training accuracy: score the fitted model on the data it was trained on.
Y_pred0 = model.predict(X_train_features)
acc0 = accuracy_score(y_true=Y_train, y_pred=Y_pred0)

In [18]:
# Display the training-set accuracy.
acc0

0.9694862014808167

In [19]:
# Test accuracy: score the fitted model on the held-out test features.
Y_pred1 = model.predict(X_test_features)
acc1 = accuracy_score(y_true=Y_test, y_pred=Y_pred1)

In [20]:
# Display the test-set accuracy.
acc1 

0.9524663677130045

In [21]:
# Classify a sample message with the trained model and the fitted vectorizer.
# The sample below is an empty string; replace it with the text to classify.

input_mail = [""]

# Reuse the vocabulary learned during training (transform, not fit_transform).
input_features = vectorizer.transform(input_mail)
pred = model.predict(input_features)

print(pred)

# A predicted label of 1 is reported as spam; anything else as ham.
print('spam mail' if pred[0] == 1 else 'ham mail')

[0]
ham mail


In [None]:
import joblib

# Persist the trained classifier to 'trained_model.joblib' so it can be
# reloaded later without retraining.
joblib.dump(model, 'trained_model.joblib')


In [None]:
# Persist the fitted TF-IDF vectorizer to 'tfidf_vectorizer.joblib'; it is
# reloaded together with the model so new text can be transformed the same way.
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

In [None]:
#Creating a GUI for Spam Detection:
import tkinter as tk
from tkinter import ttk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

# Load the trained model and vectorizer from the files written by the
# joblib.dump calls earlier in this file.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# files that come from a trusted source.
model = joblib.load('trained_model.joblib')
vectorizer = joblib.load('tfidf_vectorizer.joblib')

# Create the GUI application
class EmailSpamDetectorApp:
    """Small Tkinter front end for the spam classifier.

    The window holds a prompt label, a single-line text entry, a "Detect"
    button and one result label. Pressing the button classifies the entry's
    text with the module-level ``model`` and ``vectorizer`` objects and shows
    the verdict in the result label.
    """

    def __init__(self, root):
        """Build the widgets inside *root* (the Tk main window)."""
        self.root = root
        self.root.title("Email Spam Detector")

        self.label = ttk.Label(root, text="Enter an email:")
        self.label.pack(pady=10)

        self.text_entry = ttk.Entry(root, width=50)
        self.text_entry.pack(pady=10)

        self.detect_button = ttk.Button(root, text="Detect", command=self.detect_spam)
        self.detect_button.pack()

        # One reusable label for the verdict. Creating a new label on every
        # click would stack a growing pile of old results in the window.
        self.result_label = ttk.Label(root, text="")
        self.result_label.pack()

    @staticmethod
    def _result_message(prediction):
        """Return the user-facing verdict text for a predicted label (1 = spam)."""
        result = "spam mail" if prediction == 1 else "ham mail"
        return f"This email is classified as: {result}"

    def detect_spam(self):
        """Classify the text currently in the entry and display the verdict."""
        input_mail = [self.text_entry.get()]

        # Transform the input using the pre-fitted vectorizer
        input_features = vectorizer.transform(input_mail)

        # Make prediction
        pred = model.predict(input_features)

        # Overwrite the previous verdict instead of adding another widget.
        self.result_label.config(text=self._result_message(pred[0]))

# Create the main window and build the application widgets inside it
root = tk.Tk()
app = EmailSpamDetectorApp(root)

# Start the GUI event loop; this call blocks until the window is closed
root.mainloop()
