In [4]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
# Dataset downloaded from: https://www.kaggle.com/datasets/mfaisalqureshi/spam-email
# Load the dataset into a DataFrame; later cells read its 'Category' and 'Message' columns.
# NOTE(review): the path is absolute ('/content/...'); presumably the CSV was uploaded to a
# hosted runtime - adjust the path when running the notebook elsewhere.
df = pd.read_csv('/content/spam.csv')

In [6]:
# Display the first few rows of the dataset (head() returns 5 rows by default)
# as a quick check that the file was parsed as expected.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Download the NLTK resources that preprocess_text relies on:
#   - 'stopwords': provides stopwords.words('english'), used to drop common words
#   - 'wordnet':   the data used by nltk.stem.WordNetLemmatizer
# Both must be available before the preprocessing function is applied to the data.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
# Text preprocessing: lowercase, strip punctuation, drop stop words, lemmatize.

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the NLTK resources, so they are built once per kernel
# session instead of once per token.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return (stop_words, lemmatizer), creating and caching them on first use."""
    if not _NLP_RESOURCES:
        # stopwords.words('english') returns a list; a frozenset gives constant-time
        # membership checks and avoids re-reading the list for every token.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = nltk.stem.WordNetLemmatizer()
        # Populate the cache only after both resources were created successfully.
        _NLP_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize a raw message for bag-of-words vectorization.

    Steps, in order: lowercase, remove all ``string.punctuation`` characters,
    split on whitespace, drop English stop words, lemmatize each remaining
    word, and re-join the words with single spaces.

    Parameters
    ----------
    text : str
        Raw message. ``None`` and float NaN (missing values) are treated as an
        empty message; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned, space-separated message; ``""`` if nothing is left.
    """
    if not isinstance(text, str):
        # Missing values carry no tokens; NaN is the only float not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    words = text.lower().translate(_PUNCTUATION_TABLE).split()
    if not words:
        # Nothing to filter or lemmatize, so the NLTK resources are not needed.
        return ""

    stop_words, lemmatizer = _get_nlp_resources()
    # Stop words are removed before lemmatization, so the filter sees the
    # surface form of each word.
    return " ".join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

In [9]:
# Apply the preprocessing function to every entry of the 'Message' column.
# NOTE(review): this overwrites the raw text in place, so the original messages are no
# longer available in `df` after this cell; re-running the cell preprocesses the
# already-cleaned text a second time.
df['Message'] = df['Message'].apply(preprocess_text)

In [10]:
# Display the first few rows of the dataset after preprocessing to inspect
# what the cleaning did to the 'Message' column.
df.head()

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think go usf life around though


In [11]:
# Map 'ham' to 0 and 'spam' to 1 in the 'Category' column.
# Values that are not keys of the mapping (in particular the 0/1 codes that are
# already there when this cell is executed a second time) are passed through
# unchanged. A plain .map({...}) would replace them with NaN, silently wiping the
# whole column on a re-run.
category_codes = {'ham': 0, 'spam': 1}
df['Category'] = df['Category'].map(lambda label: category_codes.get(label, label))

In [12]:
# Display the first few rows of the dataset after mapping to confirm that
# 'Category' now holds the numeric labels (0 = ham, 1 = spam).
df.head()

Unnamed: 0,Category,Message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor u c already say
4,0,nah dont think go usf life around though


In [13]:
# Features (X) are the message texts, the target (y) is the encoded category
X, y = df['Message'], df['Category']

In [14]:
# Train/test split used by the Naive Bayes model
X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(
    X,
    y,
    test_size=0.2,      # hold out 20% of the rows for evaluation
    random_state=531,   # fixed seed so the split is reproducible
)

In [15]:
# Naive Bayes text-classification pipeline, assembled from a named list of steps
nb_steps = [
    ('vect', CountVectorizer()),    # text -> word count vectors
    ('tfidf', TfidfTransformer()),  # word counts -> TF-IDF scores
    ('clf', MultinomialNB()),       # Naive Bayes classifier
]
nb_pipeline = Pipeline(steps=nb_steps)

In [16]:
# Train the Naive Bayes model: fit() is called with the training split only
# (X_train_nb / y_train_nb); the held-out X_test_nb is not passed in here.
nb_pipeline.fit(X_train_nb, y_train_nb)

In [17]:
# Make predictions using the Naive Bayes model on the held-out messages;
# these are compared against y_test_nb in the evaluation cell.
nb_predictions = nb_pipeline.predict(X_test_nb)

In [18]:
# Train/test split used by the SVM model
# NOTE(review): the seed differs from the Naive Bayes split (531), so the two models
# are trained and scored on different partitions of the data; keep that in mind when
# comparing their metrics.
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    X,
    y,
    test_size=0.2,      # hold out 20% of the rows for evaluation
    random_state=646,   # fixed seed so the split is reproducible
)

In [19]:
# SVM text-classification pipeline, assembled from a named list of steps
svm_steps = [
    ('vect', CountVectorizer()),    # text -> word count vectors
    ('tfidf', TfidfTransformer()),  # word counts -> TF-IDF scores
    ('clf', SVC(kernel='linear')),  # Support Vector Machine with a linear kernel
]
svm_pipeline = Pipeline(steps=svm_steps)

In [20]:
# Train the SVM model: fit() is called with the training split only
# (X_train_svm / y_train_svm); the held-out X_test_svm is not passed in here.
svm_pipeline.fit(X_train_svm, y_train_svm)

In [21]:
# Make predictions using the SVM model on the held-out messages;
# these are compared against y_test_svm in the evaluation cell.
svm_predictions = svm_pipeline.predict(X_test_svm)

In [22]:
# Evaluate the Naive Bayes model on its test split: accuracy, confusion matrix
# and per-class precision/recall/F1
nb_accuracy = accuracy_score(y_test_nb, nb_predictions)
nb_confusion = confusion_matrix(y_test_nb, nb_predictions)
nb_report = classification_report(y_test_nb, nb_predictions)

print("Naive Bayes Model")
print("Accuracy:", nb_accuracy)
print("Confusion Matrix:\n", nb_confusion)
print("Classification Report:\n", nb_report)

Naive Bayes Model
Accuracy: 0.9847533632286996
Confusion Matrix:
 [[1005    0]
 [  17   93]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1005
           1       1.00      0.85      0.92       110

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.99      0.98      0.98      1115



In [23]:
# Evaluate the SVM model on its test split: accuracy, confusion matrix
# and per-class precision/recall/F1
svm_accuracy = accuracy_score(y_test_svm, svm_predictions)
svm_confusion = confusion_matrix(y_test_svm, svm_predictions)
svm_report = classification_report(y_test_svm, svm_predictions)

print("SVM Model")
print("Accuracy:", svm_accuracy)
print("Confusion Matrix:\n", svm_confusion)
print("Classification Report:\n", svm_report)

SVM Model
Accuracy: 0.9937219730941704
Confusion Matrix:
 [[989   0]
 [  7 119]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       989
           1       1.00      0.94      0.97       126

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [24]:
import pickle

In [25]:
# Persist both fitted pipelines to disk. The `with` blocks guarantee that each
# file handle is flushed and closed, even if pickling raises; passing a bare
# open(...) to pickle.dump leaves the handle open until garbage collection.
# Only unpickle these files from a trusted location: pickle.load can execute
# arbitrary code.
with open('nb_model.pkl', 'wb') as nb_model_file:
    pickle.dump(nb_pipeline, nb_model_file)

with open('svm_model.pkl', 'wb') as svm_model_file:
    pickle.dump(svm_pipeline, svm_model_file)