Project 1: Email Spam Detection

Problem Statement:
Classify each email as spam or not spam (ham) using features extracted from the text of its body.

In [107]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [108]:
# Move into the directory obtained by swapping 'notebooks' for 'data' in the
# current working directory, then read the dataset from there.
# NOTE: str.replace rewrites every 'notebooks' substring in the path, not
# only the final path component.
data_dir = os.getcwd().replace('notebooks', 'data')
os.chdir(data_dir)

df = pd.read_csv('data.csv')


In [109]:
# Peek at 10 randomly chosen rows of the freshly loaded frame.
df.sample(n=10)


Unnamed: 0.1,Unnamed: 0,Body,Label
3716,3716,use Perl Daily Headline MailerPassing the Parr...,0
4798,4798,"On Tue, Aug 13, 2002 at 12:22:14PM +0100, Nial...",0
2048,2048,>>From the BBC website - www.bbc.co.uk Tuesday...,0
908,908,empty,1
4681,4681,Thought this was funny.\nIn the slashdot threa...,0
9,9,"Dear ricardo1 ,\nCOST EFFECTIVE Direct Email A...",1
191,191,empty,1
566,566,I am a TV producer for one of the three major ...,1
720,720,empty,1
587,587,\nAre your tired of 9 to 5? \nLet us show you ...,1


In [110]:
# Rename by column *name* instead of assigning positionally to df.columns:
# the positional form raises a length mismatch if this cell is re-run after
# the 'unnamed' column has been dropped, whereas rename() simply skips
# labels that are no longer present.
df = df.rename(columns={'Unnamed: 0': 'unnamed', 'Body': 'text', 'Label': 'is_spam'})

# Fail loudly here if the CSV schema is not what the rest of the notebook expects.
assert {'text', 'is_spam'} <= set(df.columns), "expected 'text' and 'is_spam' columns after renaming"


In [111]:
# Confirm the new column names on a random sample of 10 rows.
df.sample(n=10)


Unnamed: 0,unnamed,text,is_spam
1481,1481,Government Grants E-Book 2002 editionkatfish48...,1
2239,2239,Justin Mason writes:\n> Has anyone figured out...,0
971,971,>From the ethnobotanical herbalists who brough...,1
625,625,e Earn\n $1500 Or More Per Week!\n ...,1
4544,4544,Liam Bedford wrote:\n> that is the CVS version...,0
1419,1419,"CashIC.com ï¿½nternetten para kazandiran, en o...",1
4717,4717,has anyone had a problem with Yast2 not being ...,0
993,993,"Learn How To Make $8,000 within 7-14 days!Get ...",1
668,668,empty,1
3961,3961,"URL: http://www.newsisfree.com/click/-0,836576...",0


In [112]:
# Drop the leftover index column. Rebinding (instead of inplace=True) and
# errors='ignore' keep the cell idempotent: re-running it after the column
# is already gone is a no-op rather than a KeyError.
df = df.drop(columns=['unnamed'], errors='ignore')


In [113]:
# Random sample of 10 rows: only the text and the label should remain.
df.sample(n=10)


Unnamed: 0,text,is_spam
4672,\nBAD MSG: > And you get a working version of ...,0
1556,Below is the result of your feedback form. It...,1
4158,"URL: http://www.newsisfree.com/click/-4,851800...",0
3814,URL: http://scriptingnews.userland.com/backiss...,0
5543,On 21 Jul 2002 14:20:42 +1200\nMark Derricutt ...,0
3439,\nCraig Hughes said:> > - All headers are re...,0
2597,"LOL you rool (:On Sat, 21 Sep 2002, Gregory A...",0
1673,Shoot your wad all over her face.\nThese Girls...,1
2287,">>>>> On Sat, 24 Aug 2002, ""Harlan"" == Harlan ...",0
4310,"URL: http://www.newsisfree.com/click/-3,870111...",0


In [114]:
# Number of missing values in each column.
df.isnull().sum()


text       1
is_spam    0
dtype: int64

In [115]:
# Remove rows with any missing value. Rebinding the name instead of using
# inplace=True keeps the step explicit and free of hidden in-place mutation.
df = df.dropna()

# The text preprocessing further down calls .lower() on every body, so no
# missing value may survive this step.
assert not df.isna().any().any(), "unexpected missing values after dropna"


In [None]:
# (rows, columns) of the frame after removing rows with missing values.
df.shape


(6045, 2)

In [169]:
# Features are the raw email bodies; the target is the spam flag.
X = df['text']
y = df['is_spam']


In [170]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [178]:

import re
from nltk.stem import PorterStemmer

# Single PorterStemmer instance, used by convert_text below to stem every token.
stemmer = PorterStemmer()


def convert_text(text_series):
    """Lower-case, tokenize and Porter-stem every document.

    Parameters
    ----------
    text_series : pandas.Series, iterable of str, or str
        Raw documents. Anything that is not already a Series (list, tuple,
        array, or a single string) is wrapped in one, so a pipeline built on
        this function can also score ad-hoc input such as
        ``model.predict(["some email text"])``.

    Returns
    -------
    pandas.Series
        One string per input document: the stemmed word tokens joined by a
        single space. A Series input keeps its index.
    """

    def _stem_document(text):
        # Missing bodies (None / NaN) are treated as empty documents instead
        # of raising AttributeError on .lower(); other non-strings are
        # converted to their string form.
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)
        tokens = re.findall(r'\b\w+\b', text.lower())
        return ' '.join(stemmer.stem(token) for token in tokens)

    # A bare string is a single document, not an iterable of documents.
    if isinstance(text_series, str):
        text_series = [text_series]
    if not isinstance(text_series, pd.Series):
        text_series = pd.Series(text_series, dtype=object)

    return text_series.apply(_stem_document)


In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier


In [None]:
# Baseline: stem the text, build TF-IDF features over unigrams and bigrams
# (capped at 8000 features), and classify with multinomial naive Bayes.
model = Pipeline(
    steps=[
        ('preprocess', FunctionTransformer(convert_text, validate=False)),
        (
            'tfidf',
            TfidfVectorizer(
                stop_words='english',
                max_features=8000,
                ngram_range=(1, 2),
            ),
        ),
        ('clf', MultinomialNB()),
    ]
)


In [None]:
# Fit every pipeline step on the training split only.
model.fit(X=X_train, y=y_train)


In [None]:
# Score the fitted baseline on the held-out test split.
model.score(X=X_test, y=y_test)


0.9536807278742763

In [None]:
# Candidate classifiers, all evaluated on the same TF-IDF features.
# - SVC was previously listed twice under two labels, so the identical model
#   was trained and scored twice; it now appears once.
# - Every estimator with a random component gets a fixed seed so that a
#   Restart & Run All reproduces the printed accuracies.
models = {
    'Logistic Regression' : LogisticRegression(),
    'Random Forest' : RandomForestClassifier(random_state=42),
    'Support Vector Machine' : SVC(),
    'Decision Tree' : DecisionTreeClassifier(random_state=42),
    'ExtraTreeClassifier' : ExtraTreeClassifier(random_state=42),
    'Multinomial Naive Bayes' : MultinomialNB(),
    'Bernoulli Naive Bayes' : BernoulliNB(),
    'AdaBoost' : AdaBoostClassifier(random_state=42),
    'Gradient Boosting' : GradientBoostingClassifier(random_state=42),
}

# Stemming and TF-IDF do not depend on the classifier, so they are fitted
# once on the training split and reused for every candidate instead of being
# recomputed inside the loop. The vocabulary is still learned from X_train
# only, so each classifier sees exactly the features it would see inside a
# full pipeline.
feature_pipeline = Pipeline([
    ('preprocess', FunctionTransformer(convert_text, validate=False)),
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=8000, ngram_range=(1, 2))),
])
X_train_features = feature_pipeline.fit_transform(X_train)
X_test_features = feature_pipeline.transform(X_test)

# Test accuracy per candidate. The loop no longer rebinds `model`, so the
# baseline pipeline fitted earlier is not overwritten.
model_scores = {}

for name, current_model in models.items():
    print(f"Checking accuracy for {name}")
    current_model.fit(X_train_features, y_train)
    accuracy = current_model.score(X_test_features, y_test)
    model_scores[name] = accuracy
    print(f"Accuracy for {name} is {accuracy}")
    print("*"*50)
    print("\n")

# Ranked summary of all candidates, best first.
pd.Series(model_scores, name='accuracy').sort_values(ascending=False)



Checking accuracy for Multinomial Naive Bayes
Accuracy for Multinomial Naive Bayes is 0.9528535980148883
**************************************************


Checking accuracy for Bernoulli Naive Bayes
Accuracy for Bernoulli Naive Bayes is 0.9586435070306039
**************************************************


Checking accuracy for AdaBoost




Accuracy for AdaBoost is 0.9387923904052936
**************************************************


Checking accuracy for Gradient Boosting
Accuracy for Gradient Boosting is 0.9346567411083541
**************************************************




In [None]:
from sklearn.model_selection import GridSearchCV

# Tune the TF-IDF + Bernoulli naive Bayes pipeline with 5-fold cross-validation
# on the training split. The vectorizer keeps stop_words='english' so the
# search covers the configuration used in the model comparison.
pipeline = Pipeline([
    ('preprocess', FunctionTransformer(convert_text, validate=False)),
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', BernoulliNB())
])

# Define parameter grid
param_grid = {
    'tfidf__max_df': [0.9, 1.0],
    'tfidf__min_df': [1, 3, 5],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_features': [5000, 8000, 10000],
    'clf__alpha': [0.1, 0.5, 1.0],  # Laplace smoothing
    # BernoulliNB sets a feature to 1 only when its value is strictly greater
    # than `binarize`. TF-IDF rows are L2-normalised by default, so no value
    # exceeds 1.0: a threshold of 1.0 zeroes every feature and the classifier
    # degenerates to the class prior, wasting a third of the grid. The
    # thresholds below stay on the scale of real TF-IDF weights.
    'clf__binarize': [0.0, 0.05, 0.1],
    'clf__fit_prior': [True, False]
}

# Run GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

# Best score and params
print("Best accuracy:", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)
