In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, classification_report, accuracy_score, f1_score

from imblearn.over_sampling import RandomOverSampler

# Contents

1. Exploring Data

2. Data preprocessing
* Data Cleaning
* Tokenization
* Removal of stopwords
* Stemming

3. Vectorisation using TF-IDF

4. Model Building: 4 Models used:
* Naive Bayes
* Random Forest
* KNeighbors
* SVM


5. Model Evaluation 
* Classification Reports
* Evaluation Metrics

In [None]:
# Dataset: read the SMS spam CSV from its Kaggle input path into `data`.
# encoding="latin-1" is passed explicitly — presumably the file is not valid
# UTF-8 (TODO confirm).
data=pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv", encoding="latin-1")
data

# **Exploring Data**

In [None]:
# Print dtypes / non-null counts, then display the raw column names that the
# next cell drops and renames.
data.info()
data.columns

In [None]:
# Drop irrelevant columns and rename columns.
# Built as a new frame instead of mutating in place so the cell can be re-run:
# errors="ignore" skips columns that were already dropped, and rename() leaves
# already-renamed columns untouched.
data = (
    data
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors="ignore")
    .rename(columns={'v1': 'Target', 'v2': 'Text'})
)

In [None]:
# Preview the first rows after dropping/renaming columns.
data.head()

In [None]:
# Summary statistics of the remaining columns, computed separately per Target class.
data.groupby("Target").describe()

In [None]:
# Two-colour palette reused by the later plots in this notebook.
cols = ["#05A8AA", "#EF233C"]

# Bar chart of how many messages fall into each Target class.
plt.figure(figsize=(12, 8))
fg = sns.countplot(x=data["Target"], palette=cols)
fg.set(title="Count of Spam and Ham", xlabel="Classes")
fg.set_ylabel("Count")

In [None]:
# Character count of every raw message.
data["Text_Length"] = data["Text"].map(len)

In [None]:
# Preview the frame with the new Text_Length column.
data.head()

In [None]:
# Pairwise plot coloured by class. Text_Length is the only numeric column this
# notebook has added so far, so presumably this renders a single per-class
# distribution panel (TODO confirm).
sns.pairplot(data=data,hue='Target', palette=cols)


Lengthy messages are more likely to be ham.

In [None]:
# Box plot of message lengths, used to spot unusually long messages.
# NOTE(review): the Series is passed positionally; which parameter the first
# positional argument binds to appears to differ between seaborn releases —
# consider an explicit keyword after confirming against the installed version.
sns.boxplot(data["Text_Length"])

In [None]:
# Per-column non-null counts of the rows whose message exceeds 450 characters.
is_too_long = data["Text_Length"] > 450
print(data.loc[is_too_long].count())

In [None]:
# Remove outliers: keep every message of at most 450 characters, i.e. exactly
# the complement of the "> 450" rows counted in the previous cell (the old
# "< 450" filter also discarded messages of exactly 450 characters).
# .copy() detaches the result from the original frame so the column
# assignments in later cells do not raise SettingWithCopyWarning.
data = data[data["Text_Length"] <= 450].copy()

# **Data Preprocessing**
**1. Data Cleaning**
* Extract only the alphabetic characters(to remove punctuation and numbers)
* Convert all the characters into lowercase.

In [None]:
# First ten raw messages, before cleaning.
print(data["Text"].head(10))

In [None]:
def clean(text):
    """Reduce a message to lowercase ASCII letters separated by single spaces.

    Every character other than a-z / A-Z is replaced by a space, the result is
    lowercased, and runs of whitespace are collapsed.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message; an empty string if no letters remain.
    """
    letters_only = re.sub('[^a-zA-Z]', " ", text).lower()
    # split() with no separator drops leading, trailing and repeated whitespace.
    return ' '.join(letters_only.split())

# Clean every message and keep the result next to the raw text.
data["Cleaned_text"] = data["Text"].map(clean)
    

In [None]:
# First ten messages after cleaning.
print(data["Cleaned_text"].head(10))



**2. Tokenization**

Tokenization is breaking complex data into smaller units called tokens. It can be done by splitting paragraphs into sentences and sentences into words.

In [None]:
# Tokenize each cleaned message into a list of word tokens.
# Applied to the single column that is needed rather than row-by-row over the
# whole frame (axis=1), which builds a Series object for every row.
data["Tokenized_text"] = data["Cleaned_text"].apply(nltk.word_tokenize)

In [None]:
# First ten tokenized messages.
print(data["Tokenized_text"].head(10))

**3. Remove Stopwords**

Stopwords are frequently occurring words (such as few, is, an, etc.). These words hold meaning in sentence structure, but do not contribute much to language processing in NLP.

In [None]:
# English stop words, loaded from NLTK the first time they are needed and then
# reused, instead of re-reading the corpus for every message.
_ENGLISH_STOPWORDS = None


def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of one message.
    stop_words : collection of str, optional
        Words to filter out. When omitted, NLTK's English stop-word list is
        used (loaded once and cached at module level).

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
        stop_words = _ENGLISH_STOPWORDS
    return [word for word in text if word not in stop_words]

# Filter stop words out of every tokenized message.
data["Nostopword_text"] = data["Tokenized_text"].map(remove_stopwords)

In [None]:
# First ten messages after stop-word removal.
print(data["Nostopword_text"].head(10))

**4. Stemming**

The process of reducing words to their root or base form, typically by removing suffixes or prefixes, with the aim of capturing the core meaning of words.

In [None]:
# Fetch the WordNet corpus for NLTK.
# NOTE(review): the stemming cell below uses PorterStemmer rather than
# WordNetLemmatizer — confirm whether this download is still needed.
nltk.download('wordnet')

In [None]:
# One shared Porter stemmer instance for the whole column.
stemmer = PorterStemmer()


def stem_word(text):
    """Stem every token of one message.

    Parameters
    ----------
    text : iterable of str
        Tokens of one message.

    Returns
    -------
    list of str
        The Porter stem of each token, in the original order.
    """
    return list(map(stemmer.stem, text))


data["Stemmed_text"] = data["Nostopword_text"].map(stem_word)

In [None]:
# First ten messages after stemming.
print(data["Stemmed_text"].head(10))

# Vectorization

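TF-IDF stands for Term Frequency – Inverse Document Frequency. Before modelling, the cleaned text needs to be converted into a numerical format: each message becomes a vector holding one weight per vocabulary term, and the whole corpus becomes a matrix with one row per message. This step is known as text vectorization (TF-IDF yields term weights rather than learned word embeddings).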

Steps in the Vectorizing:
* Creating a corpus of stemmed text
* Converting the corpus in vector form
* Label Encoding the classes in Target


In [None]:
# Re-join each message's stemmed tokens into one space-separated string,
# which is the input format the vectorizer expects.
corpus = [" ".join(tokens) for tokens in data["Stemmed_text"]]

corpus[:5]

In [None]:
# Learn the TF-IDF vocabulary and weights from every message in `corpus` and
# materialise the result as a dense NumPy array with one row per message.
# NOTE(review): the vectorizer is fitted before the train/test split further
# down, so its IDF statistics include messages that later land in the test set.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus).toarray()
# Display the element dtype of the dense matrix.
X.dtype

In [None]:
# Replace the string class labels in Target with integer codes.
label_encoder = LabelEncoder().fit(data["Target"])
data["Target"] = label_encoder.transform(data["Target"])

# Class Imbalance Handling

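The two classes are imbalanced. To address this, the data is first split into training and test sets, and the minority class is then randomly oversampled in the training set only, so the test set keeps its original class distribution.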

In [None]:
y = data["Target"]

# Split the text corpus first (same test_size and random_state, hence the same
# rows in each split as splitting a feature matrix of the same length) ...
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.2, random_state=11
)

# ... then learn the TF-IDF vocabulary and IDF weights from the training
# messages only, so nothing about the test messages leaks into the features.
# The test messages are only transformed with the fitted vectorizer.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(corpus_train).toarray()
X_test = tfidf.transform(corpus_test).toarray()


In [None]:
# Randomly oversample the training split only; X_test / y_test are not touched
# here. random_state=11 fixes which rows are duplicated.
# Note: this overwrites X_train / y_train with the resampled versions.
oversampler=RandomOverSampler(random_state=11)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [None]:
# Class counts of the training labels after resampling.
resampled_df = pd.DataFrame({'Target': y_train})

plt.figure(figsize=(8, 6))
balanced_ax = sns.countplot(data=resampled_df, x='Target', palette=cols)
balanced_ax.set(
    title='Class Distribution After Oversampling',
    xlabel='Class',
    ylabel='Count',
)
plt.show()

# **Model Building**

In [None]:
# The four models under comparison. The random forest is seeded with the same
# value used for the split and the oversampler so its results are reproducible
# across kernel restarts.
classifiers = [MultinomialNB(),
               RandomForestClassifier(random_state=11),
               KNeighborsClassifier(),
               SVC()]

# Fit every model on the training data.
for cls in classifiers:
    cls.fit(X_train, y_train)

# Display name for each position in `classifiers`.
pipe_dict = {0: "NaiveBayes", 1: "RandomForest", 2: "KNeighbours",3: "SVC"}

In [None]:
# Mean 10-fold cross-validated accuracy of each model on the training data.
# NOTE(review): X_train / y_train were already oversampled above, so duplicated
# minority rows can land in both the fitting and validation folds — these
# scores are likely optimistic.
for i, model in enumerate(classifiers):
    fold_scores = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=10)
    print(f"{pipe_dict[i]}: {fold_scores.mean():f} ")

# Model Evaluation

In [None]:
def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def confusion_metrics(tn, fp, fn, tp):
    """Compute the scores reported in ``metrics_df`` from binary confusion counts.

    Parameters
    ----------
    tn, fp, fn, tp : int
        True-negative, false-positive, false-negative and true-positive counts.

    Returns
    -------
    dict
        Metric name -> value, in the column order of ``metrics_df``. A metric
        whose denominator is zero is reported as NaN.
    """
    # Plain Python ints, so a zero denominator is handled by _ratio instead of
    # surfacing as a NumPy RuntimeWarning.
    tn, fp, fn, tp = int(tn), int(fp), int(fn), int(tp)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, fn + tp)
    return {
        'Accuracy': _ratio(tp + tn, tp + tn + fp + fn),
        'Precision': precision,
        'Recall': recall,
        'F1-Measure': _ratio(2 * (precision * recall), precision + recall),
        'Sensitivity': recall,  # sensitivity is the same quantity as recall
        'Specificity': _ratio(tn, tn + fp),
        'FPR': _ratio(fp, tn + fp),
        'FNR': _ratio(fn, fn + tp),
        'NPV': _ratio(tn, tn + fn),
        'FDR': _ratio(fp, fp + tp),
        'MCC': _ratio(tp * tn - fp * fn,
                      ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5),
    }


# One record (dict) per model; turned into a DataFrame after the loop.
metric_rows = []

for idx, cls in enumerate(classifiers):

    y_pred = cls.predict(X_test)

    # Confusion Matrix. labels=[0, 1] pins it to 2x2 in (negative, positive)
    # order, so the four-way unpacking below cannot fail when a class is
    # missing from both y_test and y_pred.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title(f'Confusion Matrix of {cls}')
    plt.show()

    print(classification_report(y_test, y_pred))

    tn, fp, fn, tp = cm.ravel()
    metric_rows.append({'Model': pipe_dict[idx], **confusion_metrics(tn, fp, fn, tp)})

# Columns follow the key order of the records: Model first, then the metrics.
metrics_df = pd.DataFrame(metric_rows)


In [None]:
# Show metrics as rows and models as columns.
metrics_df.T