# Message Classifier 

### The goal of this project is to build a 'ham' or 'spam' message classifier based on the data in **spam.csv**

### Import libraries

In [104]:
import re
import string

import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lee4reeal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Read the data and preview it

In [105]:
# Read the CSV through a context manager so the file handle is closed as soon as
# pandas has finished parsing it (the bare open() left it open indefinitely).
# The file is decoded as ISO-8859-1.
with open('spam.csv', 'r', encoding="ISO-8859-1") as file:
    df = pd.read_csv(file)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Drop the useless columns

In [106]:
# Discard the three 'Unnamed' columns and preview the remaining frame.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
df.head(5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Renaming the columns

In [107]:
# Give the two columns descriptive names. The result is bound to `df` by
# reassignment instead of inplace=True, so the cell does not mutate the frame
# object produced by the previous cell.
df = df.rename(columns={'v1': 'Label', 'v2': 'Message'})
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [108]:
# Encode the labels numerically: ham -> 0, spam -> 1.
# Series.map turns every value that is missing from the mapping into NaN, so the
# result is validated before the column is overwritten. This also stops an
# accidental second run of this cell from silently replacing all labels with NaN.
label_codes = df['Label'].map({'ham': 0, 'spam': 1})
assert label_codes.notna().all(), "unexpected value in 'Label' (or labels are already encoded)"
df['Label'] = label_codes

In [109]:
# Preview the first rows again to confirm the Label column now holds numeric codes.
df.head()

Unnamed: 0,Label,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Get the shape of the data

In [110]:
# (rows, columns) before duplicate removal; shown as the cell's last expression,
# like the matching shape check after de-duplication.
df.shape

(5572, 2)


### Remove duplicate rows

In [111]:
# Drop rows that are exact duplicates of an earlier row (same Label and Message).
# Reassigning instead of using inplace=True avoids mutating the frame in place.
df = df.drop_duplicates()

### Get the shape of the data after removing duplicates

In [112]:
# (rows, columns) after duplicate removal.
df.shape

(5169, 2)

In [113]:
# Count the missing (NaN/None) entries in each column.
df.isnull().sum()

Label      0
Message    0
dtype: int64

In [114]:
# Cache for the NLTK stop-word list so the corpus is read once, not once per word.
_STOPWORDS_CACHE = {}

# Translation table that deletes every character found in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def process_text(text, stop_words=None):
    """Split a message into words, dropping punctuation and stop words.

    Steps:
      1. remove every character that appears in ``string.punctuation``;
      2. split the remaining text on whitespace;
      3. discard words whose lowercase form is a stop word.

    Parameters
    ----------
    text : str
        The raw message.
    stop_words : container of str, optional
        Lowercase words to discard. Defaults to NLTK's English stop-word list.

    Returns
    -------
    list of str
        The remaining words, in their original order and original casing.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    # 1. remove punctuation
    no_punctuation = text.translate(_PUNCTUATION_TABLE)

    # 2 + 3. tokenize on whitespace and filter out stop words (case-insensitive)
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [115]:
# Sanity-check the tokenizer on the first five messages.
(
    df['Message']
    .head()
    .apply(process_text)
)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: Message, dtype: object

### Convert the collection of messages to a matrix of token counts

In [116]:
# Keep a reference to the fitted vectorizer: its vocabulary is needed to transform
# any message that was not part of this fit (e.g. new text to classify).
# NOTE(review): the vocabulary is learned from every row, including rows that later
# land in the test split, so the test scores may be slightly optimistic.
vectorizer = CountVectorizer(analyzer=process_text)
message_bow = vectorizer.fit_transform(df['Message'])

In [118]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    message_bow,
    df['Label'],
    test_size=0.20,
    random_state=0,
)

In [119]:
# Get the shape of message_bow, the token-count matrix built from df['Message']
message_bow.shape

(5169, 11304)

In [120]:
# Create and train the multinomial Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so `classifier` is the fitted model.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [121]:
# First line: predicted labels for the training rows; second line: the actual labels.
print(classifier.predict(X_train), y_train.values, sep='\n')

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [122]:
# Evaluate the model on the training data set:
# per-class precision/recall/F1, the confusion matrix, and overall accuracy.
pred = classifier.predict(X_train)
print(classification_report(y_train, pred))
print()
print('Confusion Matrix: \n', confusion_matrix(y_train, pred))
print()
print('Accuracy: ', accuracy_score(y_train, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3631
           1       0.98      0.98      0.98       504

    accuracy                           1.00      4135
   macro avg       0.99      0.99      0.99      4135
weighted avg       1.00      1.00      1.00      4135


Confusion Matrix: 
 [[3623    8]
 [  11  493]]

Accuracy:  0.9954050785973397


In [123]:
# First line: predicted labels for the held-out test rows; second line: the actual labels.
print(classifier.predict(X_test), y_test.values, sep='\n')

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [124]:
# Evaluate the model on the held-out test data set (rows not used for training):
# per-class precision/recall/F1, the confusion matrix, and overall accuracy.
pred = classifier.predict(X_test)
print(classification_report(y_test, pred))
print()
print('Confusion Matrix: \n', confusion_matrix(y_test, pred))
print()
print('Accuracy: ', accuracy_score(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      0.96      0.97       885
           1       0.80      0.93      0.86       149

    accuracy                           0.96      1034
   macro avg       0.89      0.94      0.92      1034
weighted avg       0.96      0.96      0.96      1034


Confusion Matrix: 
 [[850  35]
 [ 11 138]]

Accuracy:  0.9555125725338491
