In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
best_model = LinearSVC()  
import joblib

In [2]:
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_csv("emailspam.csv")

In [4]:
df

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...
...,...,...
83443,0,hi given a date how do i get the last date of ...
83444,1,now you can order software on cd or download i...
83445,1,dear valued member canadianpharmacy provides a...
83446,0,subscribe change profile contact us long term ...


In [5]:
df.columns

Index(['label', 'text'], dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83448 entries, 0 to 83447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   83448 non-null  int64 
 1   text    83448 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [7]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:
def preprocess(text):
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return ' '.join(tokens)
df['Clean_Text'] = df['text'].apply(preprocess)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Clean_Text'])
y=df["label"]

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Linear SVC": best_model
}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_acc = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    test_acc = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    print(f"\n {name}")
    print(f"Training Accuracy: {train_acc:.4f}, F1 Score: {train_f1:.4f}")
    print(f"Testing  Accuracy: {test_acc:.4f}, F1 Score: {test_f1:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_test_pred))


 Logistic Regression
Training Accuracy: 0.9889, F1 Score: 0.9889
Testing  Accuracy: 0.9856, F1 Score: 0.9856
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      7938
           1       0.98      0.99      0.99      8752

    accuracy                           0.99     16690
   macro avg       0.99      0.99      0.99     16690
weighted avg       0.99      0.99      0.99     16690


 Naive Bayes
Training Accuracy: 0.9820, F1 Score: 0.9820
Testing  Accuracy: 0.9766, F1 Score: 0.9766
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98      7938
           1       0.99      0.97      0.98      8752

    accuracy                           0.98     16690
   macro avg       0.98      0.98      0.98     16690
weighted avg       0.98      0.98      0.98     16690


 Linear SVC
Training Accuracy: 0.9991, F1 Score: 0.9991
Testing  Accuracy: 0.9902, F1

In [12]:

sample_email = input("Enter email content to classify: ")
cleaned_input = preprocess(sample_email)
input_transformed = vectorizer.transform([cleaned_input])
prediction = best_model.predict(input_transformed)[0]
print("\n This email is SPAM." if prediction == 1 else "\n This email is NOT SPAM.")

Enter email content to classify:  Subject: Your Order Has Been Placed Successfully  Hi Priya,  Thank you for shopping with us! Your order #456123 has been confirmed and is now being processed. You’ll receive another email once your items have been shipped.  Estimated delivery: April 9, 2025   Items: 2   Total: ₹1,299.00  Thank you for choosing SnapBuy.  Regards,   SnapBuy Team



 This email is NOT SPAM.


In [13]:
sample_email = input("Enter email content to classify: ")
cleaned_input = preprocess(sample_email)
input_transformed = vectorizer.transform([cleaned_input])
prediction = best_model.predict(input_transformed)[0]
print("\n This email is SPAM." if prediction == 1 else "\n This email is NOT SPAM.")

Enter email content to classify:  Subject: Invoice #987456 from QuickBooks  Hello,  Please find the attached invoice for your recent purchase. Your payment is due in 24 hours to avoid penalties.  Download Invoice: http://quickbooks-payments.com/invoice987456  Thank you for your prompt attention.  Sincerely,   QuickBooks Billing Team



 This email is SPAM.
