<a href="https://colab.research.google.com/github/Hirusha99/nlp/blob/main/Bag_of_word_Spam_Email.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('spam.csv')

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.Category.value_counts() # can get a value counts

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message,df.spam,test_size=0.2)


In [8]:
X_train.shape

(4457,)

In [9]:
 # apply Bag of words -> we create count vectorizer using sklearn

from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7765 sparse matrix of type '<class 'numpy.int64'>'
	with 59392 stored elements in Compressed Sparse Row format>

In [10]:
X_train_cv.shape

(4457, 7765)

In [11]:
v.get_feature_names_out().shape

(7765,)

In [13]:
v.vocabulary_

{'can': 1633,
 'you': 7727,
 'plz': 5291,
 'tell': 6789,
 'me': 4428,
 'the': 6850,
 'ans': 979,
 'bslvyl': 1544,
 'sent': 6056,
 'via': 7305,
 'fullonsms': 3040,
 'com': 1902,
 'ok': 4941,
 'then': 6861,
 'whats': 7499,
 'ur': 7233,
 'todays': 6976,
 'plan': 5260,
 'oh': 4936,
 'yeah': 7699,
 'clearly': 1841,
 'it': 3769,
 'my': 4687,
 'fault': 2793,
 'or': 5003,
 'ill': 3618,
 'be': 1274,
 'little': 4170,
 'closer': 1852,
 'like': 4132,
 'at': 1112,
 'bus': 1579,
 'stop': 6531,
 'on': 4962,
 'same': 5929,
 'street': 6550,
 'bored': 1444,
 'housewives': 3520,
 'chat': 1744,
 'date': 2178,
 'now': 4866,
 '0871750': 127,
 '77': 615,
 '11': 264,
 'bt': 1548,
 'national': 4728,
 'rate': 5601,
 '10p': 262,
 'min': 4504,
 'only': 4973,
 'from': 3020,
 'landlines': 4025,
 'fine': 2858,
 'do': 2378,
 'remember': 5715,
 'hai': 3301,
 'ana': 956,
 'tomarrow': 6991,
 'am': 931,
 'coming': 1913,
 'morning': 4610,
 'lt': 4268,
 'decimal': 2209,
 'gt': 3262,
 'there': 6867,
 'in': 3648,
 'sathy': 5

In [14]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [16]:
np.where(X_train_np[0] != 0)

(array([ 979, 1544, 1633, 1902, 3040, 4428, 5291, 6056, 6789, 6850, 7305,
        7727]),)

In [17]:
X_train[:4]

3882    Can you plz tell me the ans. BSLVYL sent via f...
5389                 Ok.ok ok..then..whats ur todays plan
5275                        Oh yeah clearly it's my fault
237     Or ill be a little closer like at the bus stop...
Name: Message, dtype: object

In [18]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv,y_train)

In [19]:
X_test_cv = v.transform(X_test)

In [20]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       973
           1       0.97      0.96      0.96       142

    accuracy                           0.99      1115
   macro avg       0.98      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [21]:
from sklearn.pipeline import Pipeline

clf = Pipeline(
    [('vectorizer',CountVectorizer()),
     ('nb',MultinomialNB())]
)

In [22]:
clf.fit(X_train,y_train)

In [23]:
y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       973
           1       0.97      0.96      0.96       142

    accuracy                           0.99      1115
   macro avg       0.98      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115

