In [82]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

raw_df = pd.read_csv('C:\\Users\\kp\\Pictures\\Assignments\\Naive Bayes\\sms_raw_NB.csv',header=0,encoding='latin-1')
raw_df.columns=['Type','Text']
raw_df

Unnamed: 0,Type,Text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000..."
4,spam,okmail: Dear Dave this is your final notice to...
...,...,...
5554,ham,You are a great role model. You are giving so ...
5555,ham,"Awesome, I remember the last time we got someb..."
5556,spam,"If you don't, your prize will go to another cu..."
5557,spam,"SMS. ac JSco: Energy is high, but u may not kn..."


In [83]:
raw_df['Text'] = raw_df['Text'].astype('str')
raw_df['Type'] = raw_df['Type'].astype('str')

raw_df = raw_df.fillna(0)
raw_df.drop(raw_df[raw_df['Text'] == 0].index, axis=0, inplace=True)
raw_df.drop(raw_df[raw_df['Type'] == 0].index, axis=0, inplace=True)
raw_df.drop_duplicates(subset='Text', keep='first', inplace=True)

raw_df=raw_df.reset_index(drop=True)

raw_df.Text = raw_df.Text.str.lower()

raw_df['Text'] = raw_df['Text'].str.replace("'s'", "")

raw_df["Text"] = raw_df['Text'].apply(lambda record: word_tokenize(record))
stop_words = set(stopwords.words("english"))
raw_df['Text'] = raw_df['Text'].apply(lambda record: [word for word in record if word not in stop_words])
    
def apply_lemmatization(string_list):

    lem = WordNetLemmatizer()
    list = []

    for word in string_list:
        list.append(lem.lemmatize(word, "v"))

    return list

raw_df['Text'] = raw_df['Text'].apply(apply_lemmatization)

raw_df.Text = raw_df.Text.apply(lambda record: " ".join(record))

raw_df["Text"] = raw_df['Text'].apply(lambda x: re.sub('[^A-Za-z" "]+', "", x))

class_codes = {'ham': 1, 'spam': -1}
raw_df['Class_Code'] = raw_df['Type']
raw_df = raw_df.replace({'Class_Code': class_codes})
cleaned_df=raw_df
cleaned_df

Unnamed: 0,Type,Text,Class_Code
0,ham,hope good week check,1
1,ham,kgive back thank,1
2,ham,also cbe pay,1
3,spam,complimentary star ibiza holiday cash need u...,-1
4,spam,okmail dear dave final notice collect teneri...,-1
...,...,...,...
5151,ham,great role model give much really wish day mi...,1
5152,ham,awesome remember last time get somebody high ...,1
5153,spam,nt prize go another customer c wwwtcbiz pm...,-1
5154,spam,sms ac jsco energy high u may know channel ...,-1


In [84]:
X_train, X_test, y_train, y_test = train_test_split(cleaned_df['Text'],
                                                    cleaned_df['Class_Code'], test_size=0.15,random_state=8)

tfidf = TfidfVectorizer(encoding='utf-8',
                                ngram_range=(1, 2),
                                stop_words=None,
                                lowercase=False,
                                max_df=1.0,
                                min_df=10,
                                max_features=700,
                                norm='l2',
                                sublinear_tf=True)
features_train = tfidf.fit_transform(X_train).toarray()
class_labels_train = y_train

features_test = tfidf.transform(X_test).toarray()
class_labels_test = y_test

In [85]:
gaussian_nbc = GaussianNB()
pred_gaussian = gaussian_nbc.fit(features_train,class_labels_train).predict(features_test)
pred_gaussian_train=gaussian_nbc.predict(features_train)
confusion_matrix(y_test,pred_gaussian)
print ("Test Accuracy of Gaussian Naive Bayes model accuracy(in %):", accuracy_score(y_test, pred_gaussian)*100)
print ("Train Accuracy of Gaussian Naive Bayes model accuracy(in %):", accuracy_score(y_train, pred_gaussian_train)*100)

Test Accuracy of Gaussian Naive Bayes model accuracy(in %): 71.57622739018088
Train Accuracy of Gaussian Naive Bayes model accuracy(in %): 75.03423094477407


In [86]:
multinomial_nbc = MultinomialNB()
pred_multinomial = multinomial_nbc.fit(features_train,class_labels_train).predict(features_test)
pred_multinomial_train=multinomial_nbc.predict(features_train)
confusion_matrix(y_test,pred_multinomial)
print ("Test Accuracy of MultiNomial Naive Bayes model accuracy(in %):", accuracy_score(y_test, pred_multinomial)*100)
print ("Train Accuracy of MultiNomial Naive Bayes model accuracy(in %):", accuracy_score(y_train, pred_multinomial_train)*100)
#MultiNomial Naive Bayes Classifier is prebable Classifier when the features Vectors are Documents or for Documents Classificatio Problem

Test Accuracy of MultiNomial Naive Bayes model accuracy(in %): 97.1576227390181
Train Accuracy of MultiNomial Naive Bayes model accuracy(in %): 97.740757644911
