In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [3]:
df = pd.read_csv('Datasets/spam.csv', encoding = "ISO-8859-1")

In [6]:
df.shape

(5572, 5)

In [7]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
df.tail

<bound method NDFrame.tail of         v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  

In [10]:
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [11]:
df.isna().sum().sum()

16648

In [12]:
#frequency of ham and spam
df['v1'].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [13]:
df['spam'] = df['v1'].apply(lambda x: 1 if x=='spam' else 0)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
0,ham,"Go until jurong point, crazy.. Available only ...",,,,0
1,ham,Ok lar... Joking wif u oni...,,,,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,1
3,ham,U dun say so early hor... U c already then say...,,,,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,0


In [14]:
x_train, x_test, y_train, y_test = train_test_split(df['v2'], df['spam'], test_size=.20)

print("X_train size: ", x_train.shape)
print("Y_train size: ", y_train.shape)
print("X_test size: ", x_test.shape)
print("Y_test size: ", y_test.shape)

X_train size:  (4457,)
Y_train size:  (4457,)
X_test size:  (1115,)
Y_test size:  (1115,)


In [15]:
v = CountVectorizer()
x_train_updated = v.fit_transform(x_train.values)
x_train_updated.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
model = MultinomialNB()
model.fit(x_train_updated, y_train)

In [17]:
x_test_vectorized = v.transform(x_test)

y_pred = model.predict(x_test_vectorized)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.2f}")

Accuracy: 0.99


In [18]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       982
           1       0.98      0.92      0.95       133

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [20]:
text1 = input("Enter SMS : ")
text2 = input("Enter another TEXT : ")
data = [text1, text2]

data = v.transform(data)
model.predict(data)

Enter SMS :  hello i am new to coding
Enter another TEXT :  working on python project


array([0, 0], dtype=int64)