In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [2]:
df = pd.read_csv("/content/spam.csv")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#convert the category column to numerical column
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

In [None]:
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Text preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    return text



In [6]:
# apply the above function to the message column
df['Message'] = df['Message'].apply(preprocess_text)

In [7]:
df['Message']

Unnamed: 0,Message
0,go until jurong point crazy available only in ...
1,ok lar joking wif u oni
2,free entry in 2 a wkly comp to win fa cup fina...
3,u dun say so early hor u c already then say
4,nah i dont think he goes to usf he lives aroun...
...,...
5567,this is the 2nd time we have tried 2 contact u...
5568,will ü b going to esplanade fr home
5569,pity was in mood for that soany other suggest...
5570,the guy did some bitching but i acted like id ...


In [None]:
df.shape

(5572, 2)

In [8]:
# Convert text to numerical features
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['Message'])
y = df['Category']

In [9]:
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print(tfidf_df.head())  # Display the first 5 rows


   008704050406  0089my  0121  01223585236  01223585334  0125698789   02  \
0           0.0     0.0   0.0          0.0          0.0         0.0  0.0   
1           0.0     0.0   0.0          0.0          0.0         0.0  0.0   
2           0.0     0.0   0.0          0.0          0.0         0.0  0.0   
3           0.0     0.0   0.0          0.0          0.0         0.0  0.0   
4           0.0     0.0   0.0          0.0          0.0         0.0  0.0   

   020603  0207  02070836089  ...  zeros  zhong  zindgi  zoe  zogtorius  zoom  \
0     0.0   0.0          0.0  ...    0.0    0.0     0.0  0.0        0.0   0.0   
1     0.0   0.0          0.0  ...    0.0    0.0     0.0  0.0        0.0   0.0   
2     0.0   0.0          0.0  ...    0.0    0.0     0.0  0.0        0.0   0.0   
3     0.0   0.0          0.0  ...    0.0    0.0     0.0  0.0        0.0   0.0   
4     0.0   0.0          0.0  ...    0.0    0.0     0.0  0.0        0.0   0.0   

   zouk  zyada  üll  〨ud  
0   0.0    0.0  0.0  0.0  
1 

In [10]:

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Train Naïve Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

In [12]:
# Predictions
y_pred = model.predict(X_test)

In [13]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)


In [14]:
# Display results
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.96


In [15]:

# Test with a new message
sample_message = ["Congratulations! You've won a free iPhone. Click here to claim."]
sample_message = vectorizer.transform(sample_message)
prediction = model.predict(sample_message)
print("Spam" if prediction[0] else "Not Spam")


Spam


In [None]:
prediction

array([1])

In [16]:
# Test with a new message
sample_message = [" Please join the meeting at 4pm"]
sample_message = vectorizer.transform(sample_message)
prediction = model.predict(sample_message)
print("Spam" if prediction[0] else "Not Spam")

Not Spam
