# Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.pipeline import make_pipeline

Download NLTK resources

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ABC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Loading the  dataset

In [3]:
data = pd.read_csv(r'C:\Users\ABC\Desktop\All Taskss\code alpha\Task1\dataset\spam.csv')

In [4]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.columns

Index(['Category', 'Message'], dtype='object')

# Preprocessing

In [6]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [7]:
def preprocess_text(text):
    words = [stemmer.stem(word) for word in text.split() if word.lower() not in stop_words]
    return ' '.join(words)

In [8]:
data['processed_text'] = data['Message'].apply(preprocess_text)

In [10]:
data

Unnamed: 0,Category,Message,processed_text
0,ham,"Go until jurong point, crazy.. Available only ...","go jurong point, crazy.. avail bugi n great wo..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joke wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor... u c alreadi say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah think goe usf, live around though"
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u. u £750 pound prize. ...
5568,ham,Will ü b going to esplanade fr home?,ü b go esplanad fr home?
5569,ham,"Pity, * was in mood for that. So...any other s...","pity, * mood that. so...ani suggestions?"
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like i'd interest buy someth els...



# Split the data into training and testing sets

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data['processed_text'], data['Category'], test_size=0.2, random_state=42)


Create a pipeline with TF-IDF vectorizer and Naive Bayes classifier

In [13]:
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

Train the model


In [14]:
model.fit(X_train, y_train)

Make predictions on the test set



In [15]:
y_pred = model.predict(X_test)

In [16]:
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [17]:
dat = pd.DataFrame({"Original":X_test,"predicted":y_pred})

In [18]:
dat

Unnamed: 0,Original,predicted
3245,squeeeeeze!! christma hug.. u lik frndshp den ...,ham
944,also i'v sorta blown coupl time recent id rath...,ham
1044,mmm that better got roast me! id b better dri...,ham
2484,mm kanji dont eat anyth heavi ok,ham
812,there' ring come guy costumes. gift futur yowi...,ham
...,...,...
4264,den weekday got special price... haiz... cant ...,ham
2439,busi juz dun wan 2 go early.. hee..,ham
5556,ye have. that' u texted. pshew...miss much,ham
4205,enjoy semester? take care brother.,ham


Evaluate the model


In [19]:
accuracy = accuracy_score(y_test, y_pred)

In [20]:
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.97



Display other metrics


In [21]:
print('\nClassification Report:\n', classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [22]:
print('\nConfusion Matrix:\n', confusion_matrix(y_test, y_pred))


Confusion Matrix:
 [[966   0]
 [ 34 115]]
