In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

%matplotlib inline

In [2]:
data =  pd.read_csv("spam.csv")
data.drop(columns = "Unnamed: 0", inplace =True)

In [3]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.to_csv("spam.csv")

In [5]:
# import regular expression and nltk
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kunda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kunda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kunda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kunda\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Data Preprocessing 
Steps:\
    1. Remove Urls \
    2. Remove Numbers and Punctuations (all irrelevant characters) \
    3. Covert all characters in lower case \
    4. Tokenization \
    5. Removing Stopwords \
    6. Removing all character having less than equal to 2 characters \
    7. Lemmatization 
    
    

In [6]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

#stopwords.words('english')

In [7]:
stop_word = set(stopwords.words('english'))
lemma = WordNetLemmatizer()

def clean_text(message):
    message = re.sub(r"https\S+",' ', message)
    message = re.sub('[^a-zA-Z]', ' ', message)
    message = str(message).lower()
    message = word_tokenize(message)
    message = [i for i in message if i not in stop_word]
    message = [i for i in message if len(i) > 2]
    message = [lemma.lemmatize(word=w, pos = 'v') for w in message]
    message = ' '.join(message)
    return message

data['clean_message'] = data['message'].apply(clean_text)


In [8]:
data.head()

Unnamed: 0,label,message,clean_message
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis great world...
1,ham,Ok lar... Joking wif u oni...,lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win cup final tkts may te...
3,ham,U dun say so early hor... U c already then say...,dun say early hor already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf live around though


# Word Embedding
1. Bag of Word
2. TF-IDF 
3. Word to Vec

In [9]:
# Bag of Word
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 3000)
X = cv.fit_transform(data['clean_message']).toarray()

In [10]:
X[:2].shape

(2, 3000)

In [11]:
y = pd.get_dummies(data['label'])
y = y.iloc[:,1].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

from sklearn.naive_bayes import MultinomialNB
spam_model = MultinomialNB().fit(X_train, y_train)

y_pred=spam_model.predict(X_test)

y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [13]:
from sklearn.metrics import accuracy_score,classification_report

score=accuracy_score(y_test,y_pred)
print("Accuracy Score: ", score)

from sklearn.metrics import classification_report
print("Calssification Report: \n" , classification_report(y_pred,y_test))

Accuracy Score:  0.9820627802690582
Calssification Report: 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       960
           1       0.95      0.92      0.93       155

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

