# Spam detection

In [51]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint, uniform
import warnings
warnings.filterwarnings('ignore')

In [52]:
# Reading the dataset
df = pd.read_csv(r"D:\projects\Spam_Ham\Dataset\spam_dataset.csv", encoding='latin1')

In [53]:
# Dropping unnecessary columns
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [54]:
# Checking dataset information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [55]:
# Replacing 'ham' and 'spam' labels with binary values (0 and 1)
df.replace(to_replace={'ham': 0, 'spam': 1}, inplace=True)

In [56]:
# Checking the distribution of 'ham' and 'spam' in the dataset
df['v1'].value_counts()

v1
0    4825
1     747
Name: count, dtype: int64

In [57]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df['v1'] == 0]
df_minority = df[df['v1'] == 1]

# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False,    # sample without replacement
                                   n_samples=len(df_minority),  # match minority class
                                   random_state=42)  # reproducible results

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled['v1'].value_counts()

v1
0    747
1    747
Name: count, dtype: int64

In [58]:
# Splitting the dataset into features (X) and labels (y)
X,y = df_downsampled.iloc[:,1],df_downsampled.iloc[:,0].astype(int)

In [59]:
# Initializing the TF-IDF Vectorizer
Vectorizer = TfidfVectorizer()

# Transforming the text data to TF-IDF features
Transformed_X = Vectorizer.fit_transform(X)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(Transformed_X, y)

In [61]:
# Importing machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [62]:
# Defining parameter distributions for RandomizedSearchCV
param_dists = {
    'Logistic Regression': {
        'penalty': ['l1', 'l2'],
        'C':  [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': [100, 200, 500]
        
    },
    'Random Forest': {
        'n_estimators': [50,100,150,175,200,250,300,400,450],
        'max_depth': [None] + list(range(5, 50)),
        'min_samples_split': [2,4,5,7,8,10,15,20],
        'min_samples_leaf': [1,2,4,5,7,8,10,15,20],
        'criterion': ['gini', 'entropy'],
    },
    'Decision Tree': {
        'max_depth': [None] + list(range(5, 50)),
        'min_samples_split': [2,4,5,7,8,10,15,20],
        'min_samples_leaf': [1,2,4,5,7,8,10,15,20],
        'criterion': ['gini', 'entropy'],
    },
    'SVC': {
        'C': [0.1,2,3,4,5,6,8,10],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto'] ,
        'degree': [2,3,4,5,6,7,8,9,10],
    },
    'Multinomial Naive Bayes': {
        'alpha': [1e-10,1e-5,1e-3,0.01,.1],
        'fit_prior': [True, False],
    }
}


In [63]:
# Instantiate models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVC': SVC(),
    'Multinomial Naive Bayes': MultinomialNB()
}

In [64]:
# Dictionary to store the trained models
Trained_model = {}

In [65]:
# Tuning hyperparameters and training the models
for model_name, model in models.items():
    print(f"Tuning hyperparameters for {model_name}...")
    random_search = RandomizedSearchCV(model, param_dists[model_name], n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)
    random_search.fit(X_train,y_train)
    Trained_model[model_name] = random_search.best_estimator_
    print(f"Best parameters for {model_name}: {random_search.best_params_}")
    print(f"Best score for {model_name}: {random_search.best_score_}")
    print()

Tuning hyperparameters for Logistic Regression...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 200, 'C': 100}
Best score for Logistic Regression: 0.9553626471305071

Tuning hyperparameters for Random Forest...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 45, 'criterion': 'entropy'}
Best score for Random Forest: 0.9508895929807458

Tuning hyperparameters for Decision Tree...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters for Decision Tree: {'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 19, 'criterion': 'gini'}
Best score for Decision Tree: 0.9232054020730885

Tuning hyperparameters for SVC...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters for SVC: {'kernel': 'sigmoid', 'gamma

In [66]:
# Importing metrics for evaluation
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix

In [67]:
# Evaluating the models on the test set
for model_name, model in Trained_model.items():
    
    y_pred = model.predict(X_test)
    print(model_name)
    print(f"The accuaracy of the {model_name} model is {accuracy_score(y_test,y_pred)}")
    print(f"The precision of the {model_name} model is {precision_score(y_test,y_pred)}")
    print(confusion_matrix(y_test,y_pred))
    print()
    
    

Logistic Regression
The accuaracy of the Logistic Regression model is 0.9572192513368984
The precision of the Logistic Regression model is 0.9763313609467456
[[193   4]
 [ 12 165]]

Random Forest
The accuaracy of the Random Forest model is 0.9518716577540107
The precision of the Random Forest model is 0.9818181818181818
[[194   3]
 [ 15 162]]

Decision Tree
The accuaracy of the Decision Tree model is 0.8796791443850267
The precision of the Decision Tree model is 0.8586956521739131
[[171  26]
 [ 19 158]]

SVC
The accuaracy of the SVC model is 0.946524064171123
The precision of the SVC model is 0.9700598802395209
[[192   5]
 [ 15 162]]

Multinomial Naive Bayes
The accuaracy of the Multinomial Naive Bayes model is 0.9652406417112299
The precision of the Multinomial Naive Bayes model is 0.9767441860465116
[[193   4]
 [  9 168]]



### Based on the performance, Multinomial Naive Bayes is selected as the final model

In [68]:
# Importing joblib for saving the model
from joblib import load,dump

In [69]:
# Selecting the best model (Multinomial Naive Bayes)
model = Trained_model['Multinomial Naive Bayes']

In [70]:
# Saving the final model to a file
dump(model,'Spam_Model.joblib')


['Spam_Model.joblib']

In [71]:
# Saving the Vectorizer

dump(Vectorizer,'Vectorizer.joblib')

['Vectorizer.joblib']