<a href="https://colab.research.google.com/github/MayThiriKyaw2310/spam_mail_detection/blob/main/Spam_Email_Detection_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix,precision_recall_curve,f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import string
import nltk
from nltk.stem import PorterStemmer
import joblib

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
df = pd.read_csv('/content/contact_data_general_questions.csv')
df.head()
print(df.isnull().values.any())
print(df.isnull().sum())

True
ID          0
Name        0
Email       0
Details     0
Message     0
Address    49
label       0
dtype: int64


In [None]:
#Handling missing values
df['Address'].fillna('Unknown Address', inplace=True)
print(df)

df['Message'] = df['Message'].apply(lambda x: x.replace('\r\n', ''))
df.info()
df['label'].value_counts()

    ID              Name                                              Email  \
0    1          jane doe                                  janedoe@gmail.com   
1    2  Richard Davidson                           leadingai@dollartip.info   
2    3      Jason Groves  aiinteractivebookscommercial@growthmarketingno...   
3    4   William Coleman                  aitools@getmoreopportunities.info   
4    5       Sarah Blake                     sarah.blake@techinnovators.com   
..  ..               ...                                                ...   
71  72        Htet Paing                            htetpaing.hpo@gmail.com   
72  73           Micheal                              waiyansm.sm@gmail.com   
73  74            thumin                                   thumin@gmail.com   
74  75            thumin                                   thumin@gmail.com   
75  76            thumin                                   thumin@gmail.com   

                          Details  \
0             

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Address'].fillna('Unknown Address', inplace=True)


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,39
1,37


In [None]:
#Defining a custom stopwords set
stopwords_set = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of",
    "at", "by", "for", "with", "about", "against", "between", "into", "through",
    "during", "before", "after", "above", "below", "to", "from", "up", "down",
    "in", "out", "on", "off", "over", "under", "again", "further", "then",
    "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
    "t", "can", "will", "just", "don", "should", "now"
}

stemmer = PorterStemmer()

corpus = []

for i in range(len(df)):
    text = df['Message'].iloc[i].lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.split()
    text = [stemmer.stem(word) for word in text if word not in stopwords_set]
    text = ' '.join(text)

    corpus.append(text)

print(f"Length of df: {len(df)}")
print(f"Length of corpus: {len(corpus)}")

#storing the processed messages in the DataFrame
if len(corpus) == len(df):
    df['Processed_Message'] = corpus
else:
    print("Error: The lengths of corpus and DataFrame do not match!")

print(df[['Message', 'Processed_Message']].head())


Length of df: 76
Length of corpus: 76
                                             Message  \
0                                            mike tl   
1  Hi kalasa.gallery, Are you ready to take your ...   
2  Hey, Guess What ? Now You Launch Your Own 6-Fi...   
3  Hi, Imagine having access to the world's leadi...   
4   We are excited to announce the launch of our ...   

                                   Processed_Message  
0                                            mike tl  
1  hi kalasagalleri readi take busi next level po...  
2  hey guess launch 6figur whitelabel lifelik int...  
3  hi imagin access world lead ai tool without ha...  
4  excit announc launch new aipow platform design...  


In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Processed_Message']).toarray()
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

svm_model = SVC(kernel='linear', C=10, probability=True, random_state=42)
cv_scores = cross_val_score(svm_model, X, y, cv=5)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

# Defining a parameter grid to search over
param_grid = {
    'C': [1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf'],
    'probability': [True]
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

# Get the best model from grid search
best_svm_model = grid_search.best_estimator_

Cross-validation scores: [0.9375     0.93333333 1.         0.86666667 0.93333333]
Mean cross-validation score: 0.9341666666666667
Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear', 'probability': True}


In [None]:
# Train the model
best_svm_model.fit(X_train, y_train)
probabilities = best_svm_model.predict_proba(X_test)
print(probabilities)

[[0.09872138 0.90127862]
 [0.14355136 0.85644864]
 [0.68682755 0.31317245]
 [0.66422146 0.33577854]
 [0.84213314 0.15786686]
 [0.15846015 0.84153985]
 [0.75561764 0.24438236]
 [0.93546708 0.06453292]
 [0.2248984  0.7751016 ]
 [0.9718391  0.0281609 ]
 [0.82440492 0.17559508]
 [0.17192968 0.82807032]
 [0.75212716 0.24787284]
 [0.90980729 0.09019271]
 [0.26436746 0.73563254]
 [0.10146196 0.89853804]
 [0.27709842 0.72290158]
 [0.2484543  0.7515457 ]
 [0.04559445 0.95440555]
 [0.92805831 0.07194169]
 [0.27551738 0.72448262]
 [0.9718391  0.0281609 ]
 [0.60943692 0.39056308]]


In [None]:
# Predict on the test set
y_pred = best_svm_model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=["Non-spam", "Spam"]))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

    Non-spam       1.00      1.00      1.00        12
        Spam       1.00      1.00      1.00        11

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23

Confusion Matrix:
[[12  0]
 [ 0 11]]


In [None]:
y_probs = best_svm_model.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

optimal_threshold = thresholds[np.argmax(2 * (recall * precision) / (recall + precision))]
print(f"Optimal Threshold: {optimal_threshold}")

new_threshold = 0.65
y_pred = (y_probs >= new_threshold).astype(int)
print("Classification Report with Threshold:")
print(classification_report(y_test, y_pred))


Optimal Threshold: 0.7229015840227548
Classification Report with Threshold:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        11

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23



In [None]:
# Preprocessing the input email
email_to_classify = df.Processed_Message[0]
email_message = email_to_classify.lower().translate(str.maketrans("", "", string.punctuation)).split()
email_message = [stemmer.stem(word) for word in email_message if word not in stopwords_set]
email_message = ' '.join(email_message)

email_corpus = [email_message]
X_email = vectorizer.transform(email_corpus)
X_email_dense = X_email.toarray()

prediction =  best_svm_model.predict(X_email_dense)
print("Prediction:", prediction)

if prediction[0] == 1:
    print("This email is classified as spam.")
else:
    print("This email is classified as non-spam.")

Prediction: [0]
This email is classified as non-spam.


In [None]:
joblib.dump(svm_model, "svm_spam_model1.pkl")
joblib.dump(vectorizer, "vectorizer1.pkl")
print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!


In [None]:
joblib.dump(best_svm_model, "svm_spam_model_best.pkl")
print("Best model saved successfully!")


Best model saved successfully!
