<a href="https://colab.research.google.com/github/Muhammad-Azham-Oman/AI_Portfolio/blob/main/ham_spam_Email.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score

In [2]:
# Load the labelled messages (columns: Category, Message) from the
# CSV file located in the current working directory.
data = pd.read_csv(filepath_or_buffer='spam.csv')

In [3]:
# Peek at the first five rows to confirm the file parsed as expected.
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Per-class summary of the messages (count / unique / most frequent),
# which also exposes how imbalanced the two classes are.
data.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [5]:
# Encode the label as a binary target: 1 for 'spam', 0 for every other
# Category value. A vectorized comparison replaces the per-row Python
# lambda; the cast to int64 keeps the column's integer dtype.
data['Spam'] = data['Category'].eq('spam').astype('int64')
data.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
# Features are the raw message text; the target is the binary Spam flag.
X, y = data['Message'], data['Spam']

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes
# the split repeatable across runs.
# NOTE(review): the split is not stratified although the classes are
# imbalanced (4825 ham vs 747 spam) — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [8]:
# Inspect the held-out messages; the index labels are carried over from `data`.
X_test

Unnamed: 0,Message
3245,Squeeeeeze!! This is christmas hug.. If u lik ...
944,And also I've sorta blown him off a couple tim...
1044,Mmm thats better now i got a roast down me! i...
2484,Mm have some kanji dont eat anything heavy ok
812,So there's a ring that comes with the guys cos...
...,...
4264,Den only weekdays got special price... Haiz......
2439,I not busy juz dun wan 2 go so early.. Hee..
5556,Yes i have. So that's why u texted. Pshew...mi...
4205,How are you enjoying this semester? Take care ...


In [9]:
# Learn the vocabulary from the training messages only and encode each
# message as a vector of token counts (a sparse matrix).
v = CountVectorizer()
X_train_counts = v.fit_transform(X_train.values)
# Slice the first three rows BEFORE densifying so that only those rows are
# materialized as a dense array, instead of the entire training matrix.
X_train_counts[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
# Multinomial Naive Bayes classifier with the library's default settings;
# it is fitted on the token-count features in the next cell.
model = MultinomialNB()

In [11]:
# Fit the classifier on the vectorized training messages and their labels.
model.fit(X=X_train_counts, y=y_train)

In [12]:
# Accuracy on the data the model was trained on (optimistic by construction;
# compare with the held-out score below).
model.score(X=X_train_counts, y=y_train)

0.9932690150325331

In [13]:
# Encode the held-out messages with the vocabulary learned on the training
# set (transform only, no refit), then predict their labels.
X_test_counts = v.transform(X_test)
y_pred = model.predict(X_test_counts)

In [14]:
# Accuracy on the held-out messages, vectorized with the fitted vocabulary.
model.score(X=v.transform(X_test), y=y_test)

0.9919282511210762

In [15]:
# Per-class precision / recall / F1 on the held-out set; print() is needed
# because the report is a pre-formatted multi-line string.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [16]:
# Confusion matrix on the held-out set: rows are the true class and columns
# the predicted class (row sums match the per-class support in the report).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[966,   0],
       [  9, 140]])

In [17]:
# Overall fraction of held-out messages labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9919282511210762

### Pipeline setup

In [18]:
# Bundle vectorization and classification into one estimator so raw text can
# be passed straight to fit / predict / score.
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [19]:
# Fit both pipeline steps on the raw training messages and their labels.
clf.fit(X=X_train, y=y_train)

In [20]:
# Predict labels for the raw held-out messages; the pipeline vectorizes them
# internally before classifying.
clf.predict(X=X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [21]:
# Held-out accuracy of the pipeline; it should match the score obtained with
# the manually vectorized model above.
clf.score(X=X_test, y=y_test)

0.9919282511210762