In [142]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [143]:
# Load the raw CSV. encoding='latin1' is passed explicitly instead of relying on
# the default UTF-8 decoding — presumably the file contains non-UTF-8 bytes; TODO confirm.
df = pd.read_csv('spam.csv', encoding='latin1')
# Preview the first rows to see which columns the file actually contains.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [144]:
# Keep only the label column (v1) and the message-text column (v2); the extra
# "Unnamed" columns appear empty in the preview of the raw frame.
df = df[['v1', 'v2']]

In [145]:
# Check that only v1 and v2 remain after the column selection.
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [146]:
# (rows, columns) of the trimmed frame.
df.shape

(5572, 2)

In [147]:
# Summary of the two object columns: count, number of unique values, and the
# most frequent value with its frequency.
df.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [148]:
# Class balance of the label column — worth knowing before choosing a split
# strategy and evaluation metrics.
df['v1'].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [149]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [150]:
# Features are the raw message text (v2); the target is the ham/spam label (v1).
X, y = df['v2'], df['v1']

In [151]:
# Hold out 20% of the messages for testing. Stratifying on y keeps the ham/spam
# ratio the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [152]:
# Inspect the training messages; the non-sequential index values show the rows
# were shuffled by the split.
X_train

184                            Going on nothing great.bye
2171                        I wont. So wat's wit the guys
5422              Ok k..sry i knw 2 siva..tats y i askd..
4113    Where are you ? What do you do ? How can you s...
4588         Have you not finished work yet or something?
                              ...                        
1932                            Jus finished avatar nigro
5316                         Jus finish watching tv... U?
2308    Moby Pub Quiz.Win a å£100 High Street prize if...
1903    Free entry in 2 a weekly comp for a chance to ...
763     Nothing but we jus tot u would ask cos u ba gu...
Name: v2, Length: 4457, dtype: object

In [153]:
# Training labels; their index values should line up with those of X_train.
y_train

184      ham
2171     ham
5422     ham
4113     ham
4588     ham
        ... 
1932     ham
5316     ham
2308    spam
1903    spam
763      ham
Name: v1, Length: 4457, dtype: object

In [154]:
# Learn the label -> integer mapping from the training labels only, then apply
# that same fitted mapping to both splits so they share identical codes.
le = LabelEncoder().fit(y_train)

y_train_le = le.transform(y_train)
y_test_le = le.transform(y_test)

In [155]:
# Encoded training labels as an integer array.
y_train_le

array([0, 0, 0, ..., 1, 1, 0])

In [156]:
# Show the classes the encoder learned (a class's position is its integer code)
# next to the encoded training labels.
for caption, values in (
    ("Class Labels:", le.classes_),
    ("Encoded Labels:", y_train_le),
):
    print(caption, values)

Class Labels: ['ham' 'spam']
Encoded Labels: [0 0 0 ... 1 1 0]


In [157]:
# All three candidates share the same text featurisation, so the TF-IDF settings
# live in one place and only the final estimator differs between pipelines.
TFIDF_MAX_FEATURES = 5000  # vocabulary size cap passed to TfidfVectorizer


def _make_text_pipeline(model):
    """Return a Pipeline that TF-IDF-vectorises raw text and feeds it to `model`.

    A fresh TfidfVectorizer is created on every call, so the returned pipelines
    do not share any fitted state.

    Parameters
    ----------
    model : estimator
        The unfitted scikit-learn classifier used as the final 'model' step.

    Returns
    -------
    Pipeline
        Two-step pipeline with steps named 'tfidf' and 'model'.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, stop_words='english')),
        ('model', model),
    ])


pipeline_lr = _make_text_pipeline(LogisticRegression())
pipeline_svc = _make_text_pipeline(SVC())
pipeline_nb = _make_text_pipeline(MultinomialNB())

In [158]:
# Display name -> pipeline. The names are used as labels when the evaluation
# results are printed; insertion order fixes the order of evaluation.
pipelines = dict([
    ('Logistic Regression', pipeline_lr),
    ('Support Vector Machine', pipeline_svc),
    ('Naive Bayes', pipeline_nb),
])

In [159]:
# Train each candidate on the training split and report its metrics on the
# held-out test split.
for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"{name} Accuracy:", accuracy_score(y_test, y_pred))
    # The multi-line report and matrix are embedded directly after the newline.
    # Passing them as a second print() argument after a "\n:" label prefixed
    # their first line with a stray ": ", which pushed the report header and the
    # first matrix row out of alignment with the rows below.
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")
    print(f"{name} Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


Logistic Regression Accuracy: 0.968609865470852
Logistic Regression Classification Report:
:               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       0.99      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Logistic Regression Confusion Matrix:
: [[965   1]
 [ 34 115]]
Support Vector Machine Accuracy: 0.9775784753363229
Support Vector Machine Classification Report:
:               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.98      0.85      0.91       149

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Support Vector Machine Confusion Matrix:
: [[964   2]
 [ 23 126]]
Naive Bayes Accuracy: 0.