In [1]:
from urllib.request import urlretrieve
import pandas as pd
import numpy as np

# 1. Downlod and Load the Data

In [2]:
urlretrieve("https://raw.githubusercontent.com/codebasics/nlp-tutorials/refs/heads/main/9_bag_of_words/spam.csv", './dataset/emails.csv')

('./dataset/emails.csv', <http.client.HTTPMessage at 0x205c5937d40>)

In [3]:
df = pd.read_csv('./dataset/emails.csv')

# 2. Analyze the dataset

In [4]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.shape

(5572, 2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
df.groupby('Category').count()

Unnamed: 0_level_0,Message
Category,Unnamed: 1_level_1
ham,4825
spam,747


# 3. Pre-Processing the  categorical data

In [8]:
df['spam'] = df.Category.apply(lambda x: 1 if x == 'spam' else 0)

In [9]:
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [10]:
x = df['Message']
y = df['spam']

In [11]:
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: spam, Length: 5572, dtype: int64

# 4. Split the dataset into training and testing portions

In [12]:
from sklearn.model_selection import train_test_split

  from scipy.sparse import csr_matrix, issparse


In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [14]:
x_train.shape

(4457,)

In [15]:
y_train.shape

(4457,)

In [16]:
y_test.shape

(1115,)

In [17]:
x_test.shape

(1115,)

In [18]:
type(x_test)

pandas.core.series.Series

In [19]:
x_train[0:5]

2965    Do you ever notice that when you're driving, a...
2948                 Leave it. U will always be ignorant.
2810    Oh yeah I forgot. U can only take 2 out shoppi...
4172    Pls what's the full name of joke's school cos ...
2124                                              #ERROR!
Name: Message, dtype: object

In [20]:
x_train

2965    Do you ever notice that when you're driving, a...
2948                 Leave it. U will always be ignorant.
2810    Oh yeah I forgot. U can only take 2 out shoppi...
4172    Pls what's the full name of joke's school cos ...
2124                                              #ERROR!
                              ...                        
2762    I am not sure about night menu. . . I know onl...
2800    I've told him that i've returned it. That shou...
5202             WOT STUDENT DISCOUNT CAN U GET ON BOOKS?
296     T-Mobile customer you may now claim your FREE ...
1430    For sale - arsenal dartboard. Good condition b...
Name: Message, Length: 4457, dtype: object

# 4. Pre-Processing the  message data using Bag Of Words (BOF) method

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
v = CountVectorizer()
x_train_cv = v.fit_transform(x_train.values)
x_train_cv

<4457x7805 sparse matrix of type '<class 'numpy.int64'>'
	with 59513 stored elements in Compressed Sparse Row format>

In [23]:
x_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4457, 7805))

In [24]:
x_train_cv.shape

(4457, 7805)

In [25]:
v.get_feature_names_out()[1000:1100]

array(['ans', 'ansr', 'answer', 'answered', 'answerin', 'answering',
       'answers', 'answr', 'antha', 'anthony', 'anti', 'any', 'anybody',
       'anyhow', 'anymore', 'anyone', 'anyones', 'anyplaces', 'anythiing',
       'anythin', 'anything', 'anytime', 'anyway', 'anyways', 'anywhere',
       'aom', 'apart', 'apartment', 'apes', 'apeshit', 'aphex', 'apnt',
       'apo', 'apologetic', 'apologise', 'apologize', 'apology', 'app',
       'apparently', 'appear', 'applausestore', 'applebees', 'apples',
       'application', 'apply', 'applyed', 'applying', 'appointment',
       'appointments', 'appreciate', 'appreciated', 'approaches',
       'approaching', 'appropriate', 'approve', 'approx', 'apps', 'appt',
       'appy', 'april', 'aproach', 'apt', 'aptitude', 'aquarius', 'ar',
       'arab', 'arabian', 'arcade', 'archive', 'ard', 'are', 'area',
       'aren', 'arent', 'arestaurant', 'areyouunique', 'argentina',
       'argh', 'argue', 'arguing', 'argument', 'arguments', 'aries',
       

In [26]:
v.get_feature_names_out().shape

(7805,)

In [None]:
dir(v)

In [71]:
v.vocabulary_

{'do': 2405,
 'you': 7768,
 'ever': 2697,
 'notice': 4883,
 'that': 6873,
 'when': 7545,
 're': 5649,
 'driving': 2485,
 'anyone': 1015,
 'going': 3219,
 'slower': 6303,
 'than': 6861,
 'is': 3810,
 'an': 976,
 'idiot': 3652,
 'and': 980,
 'everyone': 2703,
 'faster': 2816,
 'maniac': 4401,
 'leave': 4117,
 'it': 3822,
 'will': 7584,
 'always': 949,
 'be': 1297,
 'ignorant': 3661,
 'oh': 4965,
 'yeah': 7740,
 'forgot': 2990,
 'can': 1658,
 'only': 5001,
 'take': 6750,
 'out': 5070,
 'shopping': 6179,
 'at': 1139,
 'once': 4993,
 'pls': 5317,
 'what': 7539,
 'the': 6877,
 'full': 3082,
 'name': 4738,
 'of': 4943,
 'joke': 3902,
 'school': 6008,
 'cos': 2042,
 'fees': 2845,
 'in': 3703,
 'university': 7228,
 'florida': 2943,
 'seem': 6060,
 'to': 6995,
 'actually': 825,
 'lt': 4301,
 'gt': 3305,
 'holla': 3520,
 'back': 1215,
 'error': 2662,
 'total': 7057,
 'disappointment': 2371,
 'texted': 6853,
 'was': 7454,
 'craziest': 2084,
 'shit': 6164,
 'got': 3245,
 'ill': 3671,
 'down': 2456,

In [29]:
v.get_feature_names_out()[3617]

'hussey'

In [30]:
x_train_np = x_train_cv.toarray()

In [31]:
x_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], shape=(7805,))

In [32]:
np.where(x_train_np[0] !=0)

(array([ 976,  980, 1015, 2405, 2485, 2697, 2703, 2816, 3219, 3652, 3810,
        4401, 4883, 5649, 6303, 6861, 6873, 7545, 7768]),)

In [33]:
 for k in np.where(x_train_np[0] !=0):
     print(v.get_feature_names_out()[k])
     

['an' 'and' 'anyone' 'do' 'driving' 'ever' 'everyone' 'faster' 'going'
 'idiot' 'is' 'maniac' 'notice' 're' 'slower' 'than' 'that' 'when' 'you']


In [34]:
x_train[:4]

2965    Do you ever notice that when you're driving, a...
2948                 Leave it. U will always be ignorant.
2810    Oh yeah I forgot. U can only take 2 out shoppi...
4172    Pls what's the full name of joke's school cos ...
Name: Message, dtype: object

# 5. Create the first model using Naive Bayes algorithm

In [35]:
from sklearn.naive_bayes import MultinomialNB

In [65]:
mn = MultinomialNB()
mn.fit(x_train_cv,y_train)

In [37]:

x_test_cv = v.transform(x_test)

In [38]:
x_test_cv.shape

(1115, 7805)

# 5.1 Evaluate the model

In [39]:
mn.score(x_train_cv,y_train)

0.9923715503702042

In [40]:
y_pred = mn.predict(x_test_cv)

In [41]:
mn.score(x_test_cv, y_test)

0.9847533632286996

In [42]:
from sklearn.metrics import confusion_matrix, classification_report

In [43]:
confusion_matrix(y_pred,y_test)

array([[974,  14],
       [  3, 124]])

In [44]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       977
           1       0.98      0.90      0.94       138

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# 5.2 Creating an NLP Pipe Line

In [45]:
from sklearn.pipeline import Pipeline

In [46]:
model2 = Pipeline(
    [
        ("vectorizer", CountVectorizer()),
        ("MultinomialNB", MultinomialNB())
    ]
)

In [47]:
model2.fit(x_train,y_train)

In [49]:
x_test

2038                             Oh sorry please its over
5358    Hmm. Shall i bring a bottle of wine to keep us...
2485    Only if you promise your getting out as SOON a...
249     It didnt work again oh. Ok goodnight then. I.l...
5450                              Sac needs to carry on:)
                              ...                        
2329        That day you asked about anand number. Why:-)
1132                               Sorry, I'll call later
4607    Oh... Haha... Den we shld had went today too.....
4030               […] anyway, many good evenings to u! s
1137    Dont forget you can place as many FREE Request...
Name: Message, Length: 1115, dtype: object

In [50]:
pred2 = model2.predict(x_test)

In [51]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       988
           1       0.90      0.98      0.94       127

    accuracy                           0.98      1115
   macro avg       0.95      0.98      0.96      1115
weighted avg       0.99      0.98      0.99      1115



# 6. Create a model using Decision Tree Classifier

In [57]:
from sklearn.tree import DecisionTreeClassifier

In [58]:
model3 = Pipeline(
    [
        ("vectorizer", CountVectorizer()),
        (" DecisionTreeClassifier",  DecisionTreeClassifier())
    ]
)

In [59]:

model3.fit(x_train,y_train)

# 6.1 Evaluate the model

In [60]:
pred3 = model3.predict(x_test)

In [62]:
model3.score(x_train,y_train)

1.0

In [63]:
model3.score(x_test, y_test)

0.9650224215246637

In [64]:
print(classification_report(pred3, y_test))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       984
           1       0.83      0.88      0.86       131

    accuracy                           0.97      1115
   macro avg       0.91      0.93      0.92      1115
weighted avg       0.97      0.97      0.97      1115

