In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud #for text visualization

import nltk #for natural language processing 
from nltk.corpus import stopwords

# download nltk data files
nltk.download('stopwords') #for stopwords
nltk.download('punkt') #for tokenization (NLTK releases before 3.9)
# NLTK 3.9+ makes word_tokenize load the 'punkt_tab' resource instead of the pickled
# 'punkt' models; fetch it as well so the notebook runs on a fresh, current install.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to C:\Users\Gagan
[nltk_data]     Poojari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Gagan
[nltk_data]     Poojari\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the dataset from the working directory and preview it.
df = pd.read_csv(filepath_or_buffer='spam.csv')
df.head(n=5) #first 5 rows of the dataset

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Drop unnecessary columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError for columns already removed.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Give the label and text columns descriptive names.
column_names = {'v1': 'label', 'v2': 'message'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# DATA PREPROCESSING

from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integers. LabelEncoder numbers the classes in sorted
# order, so 'ham' becomes 0 and 'spam' becomes 1.
encoder = LabelEncoder()
encoder.fit(df['label'])
df['label'] = encoder.transform(df['label'])
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.duplicated(keep='first').sum() #number of rows that repeat an earlier row (all columns compared)

403

In [9]:
df.shape[0] #row count before removing duplicates

5572

In [10]:
df = df.drop_duplicates() #keeps the first occurrence of each duplicated row (the default)
df.shape[0]

5169

In [11]:
# FEATURE ENGINEERING

import string #for string manipulation

from nltk.stem.porter import PorterStemmer #for stemming the words.
# Stemming is a process of reducing words to their root form, so that words with same root can be grouped together.
# PorterStemmer is a widely used stemmer in NLP tasks.

# One shared stemmer instance, reused for every message.
ps = PorterStemmer()


In [12]:
# Function to transform text (remove punctuation, stopwords, and stem the words)

# Load the stopword list once, as a set. stopwords.words() re-reads the corpus file on
# every call, so calling it per token is very slow and a list lookup is linear on top.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalize one message into a space-separated string of stemmed tokens.

    Steps: lowercase the text, tokenize it with ``nltk.word_tokenize``, keep only
    purely alphanumeric tokens, drop English stopwords, and Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (empty string if none).
    """
    tokens = nltk.word_tokenize(text.lower())

    # isalnum() already rejects every punctuation-only token, so no separate
    # string.punctuation check is needed.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in ENGLISH_STOPWORDS]

    return ' '.join(ps.stem(tok) for tok in kept)


In [13]:
# Run every message through transform_text and store the result next to the original.
df['transformed_message'] = df['message'].map(transform_text)
df.head()

Unnamed: 0,label,message,transformed_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer #to convert text data into numerical feature vectors for machine learning models
# CountVectorizer turns a collection of documents into a matrix of token counts:
#     - one row per document, one column per unique token in the corpus,
#     - each cell holds how often that token occurs in that document.
# TfidfVectorizer produces the same layout but with TF-IDF weights
# (Term Frequency-Inverse Document Frequency):
#     - a token scores higher the more often it appears in a document
#       and the rarer it is across the corpus,
#     - so very common words are down-weighted and distinctive ones stand out.

# Keep only the 500 highest-frequency terms as features.
tfid = TfidfVectorizer(max_features=500)

# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test split,
# so vocabulary/IDF statistics include the test messages - consider fitting on the
# training portion only.
tfidf_matrix = tfid.fit_transform(df['transformed_message'])
X = tfidf_matrix.toarray() #dense array of TF-IDF features, one row per message
y = df['label'].to_numpy() #encoded labels as a NumPy array

In [15]:
# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
    test_size=0.20,
)


In [24]:
# MODEL TRAINING
from sklearn.svm import SVC #SVC is a support vector machine classifier; the chosen kernel decides whether the decision boundary is linear or non-linear.
from sklearn.linear_model import LogisticRegression #LogisticRegression is a linear model for classification that uses the logistic function to model the probability of a class label.
from sklearn.naive_bayes import MultinomialNB #MultinomialNB is a Naive Bayes classifier for multinomially distributed features such as word counts (fractional TF-IDF weights also work in practice).
from sklearn.tree import DecisionTreeClassifier  #DecisionTreeClassifier is a decision tree model for classification (DecisionTreeRegressor is the regression counterpart).
from sklearn.neighbors import KNeighborsClassifier #KNeighborsClassifier is a classification algorithm that uses the k-nearest neighbors approach.
from sklearn.ensemble import RandomForestClassifier #RandomForestClassifier is an ensemble that fits many decision trees on bootstrap samples and averages their predicted class probabilities.
from sklearn.ensemble import AdaBoostClassifier #AdaBoostClassifier is an ensemble learning method that combines multiple weak classifiers to create a strong classifier.
from sklearn.ensemble import BaggingClassifier #BaggingClassifier is an ensemble that fits base classifiers on random subsets of the training data and aggregates their predictions.
from sklearn.ensemble import ExtraTreesClassifier #ExtraTreesClassifier is an ensemble of extremely randomized decision trees whose predictions are averaged.
from sklearn.ensemble import GradientBoostingClassifier #GradientBoostingClassifier is an ensemble learning method that builds a model in a stage-wise fashion and generalizes the boosting framework.
from xgboost import XGBClassifier #XGBClassifier is an optimized gradient boosting classifier that is efficient and scalable.

# Candidate classifiers. The ensembles all use 50 estimators and a fixed random_state
# so their results are repeatable.
svc = SVC(gamma=0.1, kernel='sigmoid')
lr = LogisticRegression(penalty='l1', solver='liblinear') #penalty is the regularization type, solver the optimization algorithm.
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
knc = KNeighborsClassifier()
rfc = RandomForestClassifier(random_state=2, n_estimators=50) #n_estimators = trees in the forest
abc = AdaBoostClassifier(random_state=2, n_estimators=50) #n_estimators = weak classifiers combined
bc = BaggingClassifier(random_state=2, n_estimators=50) #n_estimators = base classifiers
etc = ExtraTreesClassifier(random_state=2, n_estimators=50) #n_estimators = trees in the forest
gbc = GradientBoostingClassifier(random_state=2, n_estimators=50) #n_estimators = boosting stages
xgb = XGBClassifier(random_state=2, n_estimators=50) #n_estimators = boosted trees

# Display name -> estimator, in the order the models are evaluated and reported.
model_registry = [
    ('Support Vector Machine', svc),
    ('Logistic Regression', lr),
    ('Multinomial Naive Bayes', mnb),
    ('Decision Tree', dtc),
    ('K-Nearest Neighbors', knc),
    ('Random Forest', rfc),
    ('AdaBoost', abc),
    ('Bagging', bc),
    ('Extra Trees', etc),
    ('Gradient Boosting', gbc),
    ('XGBoost', xgb),
]
models = dict(model_registry)



In [25]:
# MODEL EVALUATION
from sklearn.metrics import accuracy_score, precision_score

def train_classifier(models, X_train, y_train, X_test, y_test):
    """Fit one classifier on the training split and score it on the test split.

    Despite the plural name, ``models`` is a single estimator exposing ``fit``
    and ``predict``.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` computed from the test-set predictions.
    """
    models.fit(X_train, y_train)
    predictions = models.predict(X_test)
    return accuracy_score(y_test, predictions), precision_score(y_test, predictions)

# Scores are collected in the same order as the entries of `models`.
them_accuracy_scores = []
them_precision_scores = []

# The loop variable must not be called `models`: rebinding that name would replace the
# dict with the last classifier, and re-running this cell would then fail on .items().
for model_name, model in models.items():
    da_model_accuracy, da_model_precision = train_classifier(model, X_train, y_train, X_test, y_test)
    print(f"\n{model_name}:\nAccuracy: {da_model_accuracy:.4f}\nPrecision: {da_model_precision:.4f}")
    them_accuracy_scores.append(da_model_accuracy)
    them_precision_scores.append(da_model_precision)


Support Vector Machine:
Accuracy: 0.9584
Precision: 0.9798

Logistic Regression:
Accuracy: 0.9632
Precision: 0.9630

Multinomial Naive Bayes:
Accuracy: 0.9710
Precision: 0.9655

Decision Tree:
Accuracy: 0.9362
Precision: 0.9000

K-Nearest Neighbors:
Accuracy: 0.9275
Precision: 1.0000

Random Forest:
Accuracy: 0.9720
Precision: 0.9431





AdaBoost:
Accuracy: 0.9613
Precision: 0.9375

Bagging:
Accuracy: 0.9652
Precision: 0.9180

Extra Trees:
Accuracy: 0.9729
Precision: 0.9297

Gradient Boosting:
Accuracy: 0.9507
Precision: 0.9394

XGBoost:
Accuracy: 0.9700
Precision: 0.9573
