<a href="https://colab.research.google.com/github/MayThiriKyaw2310/spam_mail_detection/blob/main/Untitled21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix,precision_recall_curve
from sklearn.model_selection import cross_val_score
from nltk.stem import PorterStemmer
from sklearn.model_selection import GridSearchCV
import string


df = pd.read_csv('/content/contact_data_general_questions.csv')
df.head()

print(df.isnull().values.any())
print(df.isnull().sum())

df['Address'].fillna('Unknown Address', inplace=True)
print(df)

df['Message'] = df['Message'].apply(lambda x: x.replace('\r\n', ''))
df.info()
df['label'].value_counts()

#Defining a custom stopwords set
stopwords_set = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of",
    "at", "by", "for", "with", "about", "against", "between", "into", "through",
    "during", "before", "after", "above", "below", "to", "from", "up", "down",
    "in", "out", "on", "off", "over", "under", "again", "further", "then",
    "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
    "t", "can", "will", "just", "don", "should", "now"
}

stemmer = PorterStemmer()

corpus = []

for i in range(len(df)):
    text = df['Message'].iloc[i].lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.split()
    text = [stemmer.stem(word) for word in text if word not in stopwords_set]
    text = ' '.join(text)

    corpus.append(text)

print(f"Length of df: {len(df)}")
print(f"Length of corpus: {len(corpus)}")

#storing the processed messages in the DataFrame
if len(corpus) == len(df):
    df['Processed_Message'] = corpus
else:
    print("Error: The lengths of corpus and DataFrame do not match!")

print(df[['Message', 'Processed_Message']].head())

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Processed_Message']).toarray()
y = df['label']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


svm_model = SVC(C=1, gamma='scale', kernel='rbf', probability=True, random_state=42)
cv_scores = cross_val_score(svm_model, X, y, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

# Train the model
svm_model.fit(X_train, y_train)
probabilities = svm_model.predict_proba(X_test)
print(probabilities)

# Define a parameter grid to search over
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")


# Predict on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=["Non-spam", "Spam"]))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

y_probs = svm_model.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

optimal_threshold = thresholds[np.argmax(2 * (recall * precision) / (recall + precision))]
print(f"Optimal Threshold: {optimal_threshold}")

#Setting the threshold
threshold = 0.65
y_pred = (y_probs >= threshold).astype(int)

print("Classification Report with Threshold:")
print(classification_report(y_test, y_pred))

True
ID          0
Name        0
Email       0
Details     0
Message     0
Address    49
label       0
dtype: int64
    ID              Name                                              Email  \
0    1          jane doe                                  janedoe@gmail.com   
1    2  Richard Davidson                           leadingai@dollartip.info   
2    3      Jason Groves  aiinteractivebookscommercial@growthmarketingno...   
3    4   William Coleman                  aitools@getmoreopportunities.info   
4    5       Sarah Blake                     sarah.blake@techinnovators.com   
..  ..               ...                                                ...   
70  71        Bhone Myat                            bonemyatp2020@gmail.com   
71  72        Htet Paing                            htetpaing.hpo@gmail.com   
72  73           Micheal                              waiyansm.sm@gmail.com   
73  74            thumin                                   thumin@gmail.com   
74  75         

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Address'].fillna('Unknown Address', inplace=True)


Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Classification Report:
              precision    recall  f1-score   support

    Non-spam       1.00      1.00      1.00        12
        Spam       1.00      1.00      1.00        11

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23

Confusion Matrix:
[[12  0]
 [ 0 11]]
Optimal Threshold: 0.6225312014862918
Classification Report with Threshold:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        12
           1       1.00      0.73      0.84        11

    accuracy                           0.87        23
   macro avg       0.90      0.86      0.87        23
weighted avg       0.90      0.87      0.87        23



In [42]:
# Preprocess the input email
email_to_classify = df.Processed_Message[0]
email_message = email_to_classify.lower().translate(str.maketrans("", "", string.punctuation)).split()
email_message = [stemmer.stem(word) for word in email_message if word not in stopwords_set]
email_message = ' '.join(email_message)

# Create corpus for the input email
email_corpus = [email_message]

# Transform using the vectorizer
X_email = vectorizer.transform(email_corpus)

# Convert sparse matrix to dense array
X_email_dense = X_email.toarray()

# Predict using the trained SVM model
prediction = svm_model.predict(X_email_dense)

# Print the prediction
print("Prediction:", prediction)


Prediction: [0]


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [41]:
if prediction[0] == 1:
    print("This email is classified as spam.")
else:
    print("This email is classified as non-spam.")

This email is classified as non-spam.
