In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords


# Fetch the NLTK data packages this notebook uses: the stop-word corpus (read
# through `stopwords.words('english')` in transform_text) and the "punkt"
# tokenizer data (presumably what `nltk.word_tokenize` needs — confirm for the
# installed NLTK version). The last call's return value is the cell output.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NAGENDRA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NAGENDRA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Location of the raw spam dataset.
# NOTE(review): absolute local path — this cell only runs on a machine where
# that exact file exists.
SPAM_CSV_PATH = r'E:\MLOps\Spam-Classifier-endtoend-ML-pipeline\spam.csv'

df = pd.read_csv(SPAM_CSV_PATH)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [9]:
# Discard the three "Unnamed" columns. `df` is rebound to the returned frame
# rather than mutated with inplace=True, so the step reads as an explicit
# transformation of its input.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=unused_columns)

In [11]:
# Give the two remaining columns descriptive names: v1 -> target, v2 -> text.
# Rebinding `df` replaces the previous inplace=True mutation.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
from sklearn.preprocessing import LabelEncoder

# Swap the string labels in `target` for the integer codes LabelEncoder
# assigns (in the rows displayed below, ham becomes 0 and spam becomes 1).
encoder = LabelEncoder()
encoded_target = encoder.fit_transform(df['target'])
df['target'] = encoded_target

df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# Number of fully duplicated rows, shown next to the total row count.
duplicate_count = df.duplicated().sum()
duplicate_count, len(df)

(np.int64(403), 5572)

In [17]:
# Remove repeated rows, retaining the first occurrence of each, then report
# how many rows are left.
df = df.drop_duplicates(keep='first')
df.shape[0]

5169

In [18]:
from nltk.stem.porter import PorterStemmer
import string

# One shared Porter stemmer instance; transform_text calls `ptr.stem` on each
# token it keeps.
ptr = PorterStemmer()

In [20]:
_ENGLISH_STOPWORDS = None  # populated on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, tokenize it with
    ``nltk.word_tokenize``, keep only tokens for which ``str.isalnum()`` is
    true, drop English stop words, and stem what remains with the shared
    ``ptr`` stemmer.

    Args:
        text: The message to clean. Must be a ``str`` (``.lower()`` is called
            on it).

    Returns:
        The surviving stems joined by single spaces; an empty string when no
        token survives the filters.
    """
    # Previously the stop-word list was re-read from the corpus and scanned
    # linearly for every single token; a cached set makes each lookup O(1).
    stop_words = _english_stopwords()

    tokens = nltk.word_tokenize(text.lower())

    # A token that passes isalnum() is non-empty and contains no punctuation
    # characters, so the former `not in string.punctuation` test could never
    # reject anything and is omitted.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(ptr.stem(tok) for tok in kept)

In [23]:
# Sanity-check the cleaning function on a single raw message.
sample_message = 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
transform_text(sample_message)

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [24]:
# Run every message through transform_text and store the cleaned version in a
# new column next to the original text.
cleaned_messages = df['text'].apply(transform_text)
df['transformed_text'] = cleaned_messages
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [25]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Upper bound passed to the vectorizer's `max_features` argument.
MAX_TFIDF_FEATURES = 500

tfid = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

In [26]:
# Build the feature matrix and label vector from the cleaned frame.
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split is made, so held-out rows influence the fitted vocabulary
# and weights — consider fitting on the training portion only.
tfidf_matrix = tfid.fit_transform(df['transformed_text'])
X = tfidf_matrix.toarray()
y = df['target'].values

In [27]:
from sklearn.model_selection import train_test_split

# Hold out a fifth of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 2

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier
)
from xgboost import XGBClassifier

In [30]:
# Candidate models to compare. The six ensemble models all take the same tree
# count and seed, so those two settings are factored out.
N_ESTIMATORS = 50
RANDOM_STATE = 2
ensemble_settings = {'n_estimators': N_ESTIMATORS, 'random_state': RANDOM_STATE}

# Single (non-ensemble) models.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')

# Ensemble models.
rfc = RandomForestClassifier(**ensemble_settings)
abc = AdaBoostClassifier(**ensemble_settings)
bc = BaggingClassifier(**ensemble_settings)
etc = ExtraTreesClassifier(**ensemble_settings)
gdbt = GradientBoostingClassifier(**ensemble_settings)
xgb = XGBClassifier(**ensemble_settings)

In [31]:
from sklearn.metrics import accuracy_score,precision_score

def train_classifier(classifier,X_train,y_train,X_test,y_test):
    """Fit a classifier on the training split and score it on the test split.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        A ``(accuracy, precision)`` tuple computed with ``accuracy_score`` and
        ``precision_score`` on the test-split predictions.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )

In [32]:
# Short name -> estimator instance for every model being compared.
classifiers = {
    'svc':svc,
    'knc':knc,
    'mnb':mnb,
    'dtc':dtc,
    'lrc':lrc,
    'rfc':rfc,
    'abc':abc,
    'bc':bc,
    'etc':etc,
    'gdbt':gdbt,
    'xgb':xgb
}

# Per-model scores, in the same order as `classifiers`.
accuracy_scores = []
precision_scores = []

for name, classifier in classifiers.items():
    accuracy, precision = train_classifier(classifier,X_train,y_train,X_test,y_test)

    # Record the scores: these lists were previously created but never
    # filled, so they stayed empty after the loop.
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)

    print(f'For: {name}\nAccuracy: {accuracy}\nPrecision: {precision}')

For: svc
Accuracy: 0.9671179883945842
Precision: 0.9333333333333333
For: knc
Accuracy: 0.9274661508704062
Precision: 1.0
For: mnb
Accuracy: 0.9709864603481625
Precision: 0.9655172413793104
For: dtc
Accuracy: 0.937137330754352
Precision: 0.9010989010989011




For: lrc
Accuracy: 0.9632495164410058
Precision: 0.9629629629629629
For: rfc
Accuracy: 0.9700193423597679
Precision: 0.9421487603305785
For: abc
Accuracy: 0.9235976789168279
Precision: 0.8734177215189873
For: bc
Accuracy: 0.9622823984526112
Precision: 0.9024390243902439
For: etc
Accuracy: 0.9709864603481625
Precision: 0.921875
For: gdbt
Accuracy: 0.9497098646034816
Precision: 0.93
For: xgb
Accuracy: 0.9690522243713733
Precision: 0.9568965517241379


In [34]:
# NOTE(review): this rebinds `df`, so the processed frame built by the earlier
# cells is replaced by whatever this URL returns.
SPAM_CSV_URL = 'https://raw.githubusercontent.com/Nag28endra/datasets/refs/heads/main/spam.csv'
df = pd.read_csv(SPAM_CSV_URL)