In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
# Read the labelled message dataset; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv("spam.csv")
# Show rows from position 20 onward (pandas truncates the rendered table).
df.iloc[20:]

Unnamed: 0,Category,Message
20,ham,Is that seriously how you spell his name?
21,ham,I‘m going to try for 2 months ha ha only joking
22,ham,So ü pay first lar... Then when is da stock co...
23,ham,Aft i finish my lunch then i go str down lor. ...
24,ham,Ffffffffff. Alright no way I can meet up with ...
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Peek at the last five rows of the dataset.
df.tail(n=5)

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [4]:
# Peek at the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Binary target column: 1 where Category is 'spam', 0 for anything else.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the column as integer 0/1 rather than booleans.
df['spam'] = (df['Category'] == 'spam').astype('int64')

In [6]:
# Re-display rows from position 20 onward to check the new `spam` column.
df.iloc[20:]

Unnamed: 0,Category,Message,spam
20,ham,Is that seriously how you spell his name?,0
21,ham,I‘m going to try for 2 months ha ha only joking,0
22,ham,So ü pay first lar... Then when is da stock co...,0
23,ham,Aft i finish my lunch then i go str down lor. ...,0
24,ham,Ffffffffff. Alright no way I can meet up with ...,0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [7]:
# Bag-of-words vectorizer with all-default settings. It is fitted later on
# the training messages only, then reused to encode test and new messages.
cv = CountVectorizer()

In [8]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes
# the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df["Message"],
    df["spam"],
    test_size=0.2,
    random_state=69,
)

In [9]:
# Spot-check one training message: take the first 10 rows by position, then
# select the row whose index label is 3552. The labels are the original
# DataFrame row numbers, so positional and label access are spelled out
# explicitly with .iloc / .loc instead of two ambiguous `[]` lookups.
x_train.iloc[:10].loc[3552]

'I love you both too :-)'

In [10]:
# Learn the vocabulary from the training messages and encode them as a
# sparse token-count matrix in a single step.
x_train_cv = cv.fit_transform(raw_documents=x_train)

In [11]:
# Multinomial Naive Bayes with default hyperparameters, trained on the
# token-count matrix and the 0/1 spam labels.
model = MultinomialNB()
model.fit(X=x_train_cv, y=y_train)

In [12]:
# Display the fitted estimator's repr as the cell output.
model

In [13]:
# Encode the held-out messages with the vocabulary already learned from the
# training set (transform only — the vectorizer is not refitted here).
x_test_cv = cv.transform(raw_documents=x_test)
# NOTE: despite its name, `score` holds the predicted 0/1 labels, not a metric.
score = model.predict(X=x_test_cv)

In [14]:
# Per-class precision / recall / F1 of the predictions against the true labels.
report = classification_report(y_true=y_test, y_pred=score)

In [15]:
# The report is a multi-line string, so print() is used to render its line
# breaks; a bare expression would show the escaped repr instead.
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.98      0.92      0.95       151

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [16]:
# Two unseen sample texts used to try out the trained classifier.
emails = [
    "You’ve just won a 20 % discount ",
    "ok ok ,i'm not that brave uk!!!!i'm texting her instead!!",
]

In [17]:
# Encode the sample texts with the training vocabulary; words the vectorizer
# never saw during fitting are dropped.
emails_cv = cv.transform(raw_documents=emails)
emails_cv

<2x7748 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [18]:
# Classify the encoded sample texts (1 = spam, 0 = ham).
model.predict(X=emails_cv)

array([1, 0])

A shorter way to get the same work done: chain the vectorizer and the classifier in a scikit-learn `Pipeline`.

In [19]:
from sklearn.pipeline import Pipeline

In [20]:
# Chain vectorization and classification so raw text can be passed straight
# to fit/predict without a separate transform step.
pipeline = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer()),
        ("model", MultinomialNB()),
    ]
)

In [21]:
# Fit both steps on the raw training messages. fit() returns the pipeline
# itself, so `model_pipeline` refers to the same object as `pipeline`.
model_pipeline = pipeline.fit(X=x_train, y=y_train)

In [22]:
# Predict labels for the raw held-out messages. This reassigns `score`,
# which (despite its name) holds predicted 0/1 labels.
score = model_pipeline.predict(X=x_test)

In [23]:
# Text report of per-class precision / recall / F1 for the pipeline's predictions.
model_pipeline_score = classification_report(y_true=y_test, y_pred=score)

In [24]:
# The report is a multi-line string; print() renders its line breaks.
print(model_pipeline_score)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.98      0.92      0.95       151

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [27]:
# Classify the raw sample texts directly; the pipeline vectorizes them
# internally before predicting (1 = spam, 0 = ham).
model_pipeline.predict(X=emails)

array([1, 0])