# Creating Model

In [20]:
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [21]:
# Fetch NLTK's stopword corpus into the local nltk_data directory.
# NOTE(review): the model cells below use CountVectorizer(stop_words='english'),
# i.e. scikit-learn's built-in list, so this download does not appear to be
# needed for training — confirm before removing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
# Read the spam dataset, keep only the label (v1) and message (v2) columns,
# and give them descriptive names.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
# Separate the message text (features) from the ham/spam label (target)
X, y = data['text'], data['label']

In [24]:
# Preview the first few raw messages
X.head()

Unnamed: 0,text
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."


In [25]:
# Preview the labels while they are still the strings 'ham' / 'spam'
y.head()

Unnamed: 0,label
0,ham
1,ham
2,spam
3,ham
4,ham


In [26]:
# Encode labels as integers: ham -> 0, spam -> 1.
# Map from the original column rather than from `y` itself so this cell is
# idempotent: re-running `y = y.map(...)` on already-encoded values would
# silently turn every label into NaN.
y = data['label'].map({'ham': 0, 'spam': 1})

In [27]:
# Confirm the labels are now encoded as 0 (ham) / 1 (spam)
y.head()

Unnamed: 0,label
0,0
1,0
2,1
3,0
4,0


In [28]:
# Hold out 25% of the messages for evaluation; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [29]:
# Build bag-of-words count features, dropping scikit-learn's built-in English stop words.
# The vocabulary is learned from the training split only; the test split is
# transformed with that same fitted vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [30]:
# Fit a multinomial Naive Bayes classifier on the training word counts
model = MultinomialNB()
model.fit(X_train_vec, y_train)

In [31]:
# Score the classifier on the held-out test split
y_pred = model.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.9827709978463748


In [32]:
# Per-class precision / recall / F1 for the held-out test split
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred, target_names=['Ham', 'Spam'])
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.99      0.99      1202
        Spam       0.96      0.91      0.94       191

    accuracy                           0.98      1393
   macro avg       0.97      0.95      0.96      1393
weighted avg       0.98      0.98      0.98      1393



In [33]:
import pickle

# Persist the fitted classifier and its vectorizer so they can be reloaded for inference.
# The context managers guarantee each file is flushed and closed even if
# pickling raises, instead of leaking the handle returned by a bare open().
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Best model and vectorizer have been saved successfully!")


Best model and vectorizer have been saved successfully!


# Model Testing

In [42]:
import pickle

In [49]:
# Hand-written sample messages used as a quick sanity check of the saved model
test_emails = [
    "Congratulations! You've won a free ticket to the Bahamas! Call now!",
    "Hey John, let's grab lunch tomorrow.",
    "Get 50% off on your next purchase, only for today!",
    "Can we reschedule our meeting to next week?",
    "Wow, You have won 50,000 Dollar"]

In [50]:
# Load the trained model and vectorizer from disk.
# Context managers close each file as soon as it has been read, instead of
# leaking the handle returned by a bare open().
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

print("Model and vectorizer loaded successfully.")

Model and vectorizer loaded successfully.


In [51]:
# Vectorize the sample emails with the already-fitted vectorizer (transform only, no re-fitting)
test_emails_vec = vectorizer.transform(test_emails)

In [52]:
# Predict one label per sample email (1 is treated as spam when the results are printed)
test_predictions = model.predict(test_emails_vec)

In [53]:
# Print each email next to its human-readable verdict
for email, prediction in zip(test_emails, test_predictions):
    if prediction == 1:
        result = "Spam"
    else:
        result = "Not Spam"
    print(f"Email: {email}\nPrediction: {result}\n")

Email: Congratulations! You've won a free ticket to the Bahamas! Call now!
Prediction: Spam

Email: Hey John, let's grab lunch tomorrow.
Prediction: Not Spam

Email: Get 50% off on your next purchase, only for today!
Prediction: Spam

Email: Can we reschedule our meeting to next week?
Prediction: Not Spam

Email: Wow, You have won 50,000 Dollar
Prediction: Spam



# Streamlit Code

In [None]:
import streamlit as st
import pickle
import nltk
from nltk.corpus import stopwords

# Ensure that nltk stopwords are downloaded
nltk.download('stopwords')

# Load the trained model and vectorizer, closing each file as soon as it is
# read instead of leaking the handle returned by a bare open().
# NOTE(review): the notebook cells above save the classifier as 'model.pkl';
# nothing visible here writes 'best_spam_model.pkl' — confirm this file
# exists next to the app (presumably the exported PyCaret model).
with open('best_spam_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Function to preprocess text
def preprocess_text(text):
    """Lowercase ``text`` and drop NLTK English stop words.

    The text is split on whitespace only, so punctuation stays attached to
    the words it touches. The surviving words are returned joined by single
    spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept = (token for token in text.lower().split() if token not in english_stops)
    return ' '.join(kept)

# Streamlit UI
st.title("Spam Email Detection Using NLP and PyCaret")

# Input form
user_input = st.text_area("Enter the email content here", height=200)

# Detect button
if st.button("Detect Spam"):
    if user_input and user_input.strip():
        # Vectorize the raw text. The vectorizer was fitted on raw messages and
        # already lowercases and removes its own English stop words, so
        # stripping NLTK's (different) stop-word list first would feed the
        # model input that does not match what it was trained on.
        input_vec = vectorizer.transform([user_input])

        # Predict using the loaded model (dense array, as in the notebook's PyCaret cells)
        prediction = model.predict(input_vec.toarray())

        # Display the result
        if prediction[0] == 1:
            st.error("This is a Spam email.")
        else:
            st.success("This is not a Spam email.")
    else:
        # Give feedback instead of silently doing nothing on empty input
        st.warning("Please enter some email content first.")


# Using PyCaret

In [None]:
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from pycaret.classification import setup, compare_models, finalize_model
import numpy as np

# Download the stopwords from nltk
nltk.download('stopwords')

# Read the dataset and give the two useful columns descriptive names
data = pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
data.columns = ['label', 'text']

# Features are the raw messages; targets are encoded ham -> 0, spam -> 1
X = data['text']
y = data['label'].map({'ham': 0, 'spam': 1})

# 75/25 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Bag-of-words counts; the vocabulary is learned from the training split only
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# PyCaret is given one dense DataFrame holding both the features and the target column
X_train_df = pd.DataFrame(X_train_vec.toarray())
X_train_df['label'] = y_train.values

# Initialise the PyCaret experiment on the already-vectorized training data
clf_setup = setup(
    data=X_train_df,
    target='label',
    train_size=0.99,
    preprocess=False,
    session_id=42,
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Description,Value
0,Session id,42
1,Target,label
2,Target type,Binary
3,Original data shape,"(4179, 7181)"
4,Transformed data shape,"(4179, 7181)"
5,Transformed train set shape,"(4137, 7181)"
6,Transformed test set shape,"(42, 7181)"
7,Numeric features,7180


In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from pycaret.classification import setup, compare_models, finalize_model
import signal

# Fetch NLTK's stopword corpus (the output below shows it is skipped when already up to date)
nltk.download('stopwords')

# Timeout handler
class TimeoutException(Exception):
    """Raised by the alarm signal handler when the time budget runs out."""

def timeout_handler(signum, frame):
    """Signal handler: abort whatever is running by raising TimeoutException.

    ``signum`` and ``frame`` are supplied by the signal machinery and are not used.
    """
    raise TimeoutException()

# Install timeout_handler for SIGALRM so a later signal.alarm() raises TimeoutException.
# NOTE(review): SIGALRM is a Unix-only signal; this cell is not expected to work on Windows — confirm target platform.
signal.signal(signal.SIGALRM, timeout_handler)

# Load the dataset, keeping the label (v1) and message (v2) columns
data = pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
data.columns = ['label', 'text']

# Raw messages as features; labels encoded ham -> 0, spam -> 1
X, y = data['text'], data['label']
y = y.map({'ham': 0, 'spam': 1})

# Reproducible 75/25 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Count-vectorize the text; fit on the training split, reuse the vocabulary for the test split
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Dense DataFrame of training counts plus the target column, as handed to PyCaret
X_train_df = pd.DataFrame(X_train_vec.toarray())
X_train_df['label'] = y_train.values

# PyCaret setup on the already-vectorized data (PyCaret's own preprocessing disabled)
clf_setup = setup(
    data=X_train_df,
    target='label',
    preprocess=False,
    session_id=42,
)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Description,Value
0,Session id,42
1,Target,label
2,Target type,Binary
3,Original data shape,"(4179, 7181)"
4,Transformed data shape,"(4179, 7181)"
5,Transformed train set shape,"(2925, 7181)"
6,Transformed test set shape,"(1254, 7181)"
7,Numeric features,7180


In [None]:
# Wall-clock budget in seconds for the whole compare_models() call below.
# The alarm is armed once, so this is NOT a per-model limit.
time_limit = 60  # Adjust as needed

best_model = None
# NOTE(review): best_accuracy is assigned here but never read in this cell — confirm it is still needed.
best_accuracy = 0

try:
    # Arm the alarm: SIGALRM fires after time_limit seconds and the registered handler raises TimeoutException
    signal.alarm(time_limit)

    # Compare models using PyCaret
    best_model = compare_models()

    # Comparison finished in time: cancel the pending alarm so the steps below are not interrupted
    signal.alarm(0)

    # Finalize the best model
    final_best_model = finalize_model(best_model)

    # Make predictions on the held-out test set (dense array, matching the dense training frame)
    y_pred = final_best_model.predict(X_test_vec.toarray())

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Best model accuracy: {accuracy}")

    # Display classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

except TimeoutException:
    # best_model is still None here; later cells that use it will not work after a timeout
    print("Model comparison took too long and was skipped.")
except Exception as e:
    # Any other failure is only printed, so the cell itself never raises
    print(f"Error occurred: {e}")
finally:
    # Reset the alarm (important to clear the timer)
    signal.alarm(0)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.9805,0.9871,0.8791,0.9725,0.9218,0.9108,0.9133,1.646
et,Extra Trees Classifier,0.9767,0.9916,0.8302,0.9944,0.9036,0.8906,0.896,7.916
ridge,Ridge Classifier,0.9764,0.992,0.8252,0.997,0.9016,0.8885,0.8944,3.179
lr,Logistic Regression,0.9747,0.99,0.8252,0.9817,0.8961,0.8818,0.8865,2.086
xgboost,Extreme Gradient Boosting,0.973,0.9807,0.833,0.9594,0.8901,0.8749,0.8787,12.4
rf,Random Forest Classifier,0.9706,0.9898,0.7814,0.9967,0.8749,0.8586,0.8674,5.393
lightgbm,Light Gradient Boosting Machine,0.9679,0.9733,0.8356,0.92,0.8735,0.8552,0.8579,2.722
ada,Ada Boost Classifier,0.9655,0.9637,0.779,0.9536,0.856,0.8366,0.8431,5.758
dt,Decision Tree Classifier,0.9641,0.8999,0.8124,0.9093,0.856,0.8356,0.8387,2.852
gbc,Gradient Boosting Classifier,0.9583,0.9723,0.7016,0.9794,0.8152,0.7926,0.8079,14.899


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Best model accuracy: 0.9791816223977028
Classification Report:
              precision    recall  f1-score   support

         Ham       0.98      1.00      0.99      1202
        Spam       0.98      0.86      0.92       191

    accuracy                           0.98      1393
   macro avg       0.98      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [None]:
# Finalize the best model.
# NOTE(review): the previous cell already calls finalize_model(best_model) into
# final_best_model; this repeats it under a second name. best_model is None if
# that cell timed out or errored.
final_model = finalize_model(best_model)

In [None]:
# Display the finalized model object
final_model

In [None]:
# String form of the best model. As the output shows, this is the estimator's
# full repr including hyperparameters, not just a short name.
best_model_name = str(best_model)
best_model_name

"SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n              early_stopping=False, epsilon=0.1, eta0=0.001, fit_intercept=True,\n              l1_ratio=0.15, learning_rate='optimal', loss='hinge',\n              max_iter=1000, n_iter_no_change=5, n_jobs=-1, penalty='l2',\n              power_t=0.5, random_state=42, shuffle=True, tol=0.001,\n              validation_fraction=0.1, verbose=0, warm_start=False)"

In [None]:
# Score the finalized model on the held-out test split (dense input)
y_pred = final_model.predict(X_test_vec.toarray())
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data:", test_accuracy)

Accuracy on test data: 0.9791816223977028


In [None]:
# Per-class precision / recall / F1 for the finalized model
report = classification_report(y_test, y_pred, target_names=['Ham', 'Spam'])
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

         Ham       0.98      1.00      0.99      1202
        Spam       0.98      0.86      0.92       191

    accuracy                           0.98      1393
   macro avg       0.98      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [None]:
# Hand-written sample messages used as a quick sanity check of the finalized model
test_emails = [
    "Congratulations! You've won a free ticket to the Bahamas! Call now!",
    "Hey John, let's grab lunch tomorrow.",
    "Get 50% off on your next purchase, only for today!",
    "Can we reschedule our meeting to next week?",
    "Get your coupon of 30% off"
]

In [None]:
# Vectorize the sample emails with the already-fitted vectorizer (transform only, no re-fitting)
test_emails_vec = vectorizer.transform(test_emails)

In [None]:
# Predict with the finalized model; the sparse counts are converted to a dense array first
test_predictions = final_model.predict(test_emails_vec.toarray())

In [None]:
# Print each email next to its human-readable verdict
for email, prediction in zip(test_emails, test_predictions):
    if prediction == 1:
        result = "Spam"
    else:
        result = "Not Spam"
    print(f"Email: {email}\nPrediction: {result}\n")


Email: Congratulations! You've won a free ticket to the Bahamas! Call now!
Prediction: Not Spam

Email: Hey John, let's grab lunch tomorrow.
Prediction: Not Spam

Email: Get 50% off on your next purchase, only for today!
Prediction: Not Spam

Email: Can we reschedule our meeting to next week?
Prediction: Not Spam

Email: Get your coupon of 30% off
Prediction: Not Spam

