In [1]:
import numpy as np
import pandas as pd

import warnings
# Silence every warning for the rest of the kernel session: no category or
# module filter is given, so deprecation and data-handling warnings are hidden too.
warnings.filterwarnings('ignore')

In [2]:
# Location of the spam dataset CSV, relative to the kernel's working directory.
path = '../All_data_sets/nlp/spam.csv'

In [3]:
# Load the raw file with explicit column labels. Because `names=` is supplied
# and `header` is left at its default, pandas treats the file as header-less,
# which is why the CSV's own header line ("v1", "v2") appears as data row 0
# in the preview.
df = pd.read_csv(
    path,
    names=['target', 'text', '0', '1', '2'],
    encoding='iso-8859-1',
)
df.head()

Unnamed: 0,target,text,0,1,2
0,v1,v2,,,
1,ham,"Go until jurong point, crazy.. Available only ...",,,
2,ham,Ok lar... Joking wif u oni...,,,
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
4,ham,U dun say so early hor... U c already then say...,,,


In [4]:
# Placeholder-named columns created by `names=` at load time; they are empty in
# the preview rows and are not used by the rest of the notebook.
drop_columns = ['0', '1', '2']

In [5]:
# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell is a harmless no-op rather than a KeyError.
df = df.drop(columns=drop_columns, errors='ignore')

In [6]:
# Confirm that only `target` and `text` remain after the drop.
df.head()

Unnamed: 0,target,text
0,v1,v2
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...


In [7]:
# The CSV's own header line was loaded as a data row whose target is 'v1'
# (visible in the preview). Remove it by value rather than by position: a
# positional slice would discard one more genuine message every time the cell
# is re-run. `.copy()` yields an independent frame so that columns added later
# are not written into a slice of the loaded data.
df = df.loc[df['target'] != 'v1'].copy()

df

Unnamed: 0,target,text
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...
5569,ham,Will Ì_ b going to esplanade fr home?
5570,ham,"Pity, * was in mood for that. So...any other s..."
5571,ham,The guy did some bitching but I acted like i'd...


In [8]:
import nltk
import re

# remove the stop words
from nltk.corpus import stopwords

# reduce the words to its root word
from nltk.stem import WordNetLemmatizer , PorterStemmer

In [9]:
# Shared NLP helpers for the text-cleaning step.
# presumably the NLTK 'wordnet' and 'stopwords' corpora must already be
# downloaded; no nltk.download call is visible in this notebook - TODO confirm.
lemma = WordNetLemmatizer()
# NOTE(review): `ps` is not referenced by any visible cell; only `lemma` is used.
ps = PorterStemmer()
# English stop words as returned by NLTK; preprocessing() converts them to a set.
stop_words = stopwords.words('english')

In [10]:
def preprocessing(message):
    """Normalise one raw message into a space-joined string of lemmas.

    The message is reduced to ASCII letters (every other character becomes a
    space), lower-cased, split on whitespace, stripped of English stop words
    and finally lemmatized with the module-level ``lemma`` object.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # Build the lookup set once per message instead of once per token; the
    # module-level `stop_words` is a list, so this keeps membership tests cheap.
    stop_set = set(stop_words)

    # Keep only letters: digits, punctuation and non-ASCII characters become spaces.
    letters_only = re.sub('[^a-zA-Z]', ' ', message)

    # Lower-case first so the comparison against the lower-case stop words works.
    tokens = letters_only.lower().split()

    # Drop stop words, then lemmatize (not stem) whatever is left.
    lemmas = [lemma.lemmatize(word) for word in tokens if word not in stop_set]

    return ' '.join(lemmas)

In [11]:
# Run the cleaning function over every message and store the result in a new column.
df['cleaned_text'] = df['text'].apply(preprocessing)

In [12]:
# Compare the new `cleaned_text` column with the raw text.
df.head()

Unnamed: 0,target,text,cleaned_text
1,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
2,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
4,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
5,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

## Create the bag-of-words features

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Count-based vectorizer over unigrams and bigrams, with the vocabulary capped at
# 2500 terms (scikit-learn keeps the most frequent ones across the corpus).
cv = CountVectorizer(max_features=2500,ngram_range=(1,2))

In [15]:
# Learn the vocabulary and build the document-term count matrix as a dense array.
# NOTE(review): the vectorizer is fitted on the full corpus before
# train_test_split, so vocabulary selection also sees messages that later end up
# in the test set (data leakage). Fitting on the training rows only avoids this.
X = cv.fit_transform( df['cleaned_text'] ).toarray()

In [16]:
# Inspect the learned term -> column-index mapping (unigrams and bigrams).
cv.vocabulary_

{'go': 799,
 'point': 1602,
 'crazy': 451,
 'available': 117,
 'bugis': 229,
 'great': 854,
 'world': 2437,
 'la': 1079,
 'cine': 345,
 'got': 838,
 'wat': 2341,
 'ok': 1467,
 'lar': 1090,
 'joking': 1046,
 'wif': 2398,
 'free': 727,
 'entry': 613,
 'wkly': 2421,
 'comp': 405,
 'win': 2403,
 'cup': 459,
 'final': 683,
 'st': 1961,
 'may': 1258,
 'text': 2082,
 'receive': 1696,
 'question': 1657,
 'std': 1976,
 'txt': 2199,
 'rate': 1670,
 'apply': 79,
 'free entry': 732,
 'entry wkly': 615,
 'std txt': 1977,
 'txt rate': 2206,
 'rate apply': 1671,
 'dun': 567,
 'say': 1790,
 'early': 577,
 'already': 50,
 'nah': 1382,
 'think': 2102,
 'usf': 2275,
 'life': 1136,
 'around': 92,
 'though': 2110,
 'freemsg': 739,
 'hey': 918,
 'darling': 477,
 'week': 2367,
 'word': 2430,
 'back': 136,
 'like': 1141,
 'fun': 758,
 'still': 1979,
 'tb': 2056,
 'xxx': 2467,
 'send': 1824,
 'rcv': 1677,
 'even': 623,
 'brother': 221,
 'speak': 1943,
 'treat': 2171,
 'per': 1535,
 'request': 1732,
 'melle': 1

In [17]:
# Label column holding the 'ham' / 'spam' strings; the train/test splits in this
# notebook use this Series as the target.
y = df['target']

In [23]:
# Preview the frame again (same columns as the preview shown after cleaning).
df.head()

Unnamed: 0,target,text,cleaned_text
1,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
2,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
4,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
5,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though


In [27]:
# One-hot encode the label and drop the first category, leaving a single 0/1
# 'spam' indicator column.
# NOTE(review): the visible training cells pass `y` (string labels) to
# train_test_split, not `y_target`, so this encoding is currently unused.
y_target = pd.get_dummies( df['target'] ,drop_first=True , dtype=int)

y_target

Unnamed: 0,spam
1,0
2,0
3,1
4,0
5,0
...,...
5568,1
5569,0
5570,0
5571,0


In [30]:
# Convert the one-column indicator frame to a NumPy array of shape (n_rows, 1).
# np.asarray accepts both the DataFrame and an already-converted ndarray, so
# re-running this cell is safe (an ndarray has no `.iloc` accessor).
y_target = np.asarray(y_target)

In [32]:
from sklearn.model_selection import train_test_split 

In [33]:
# Hold out 25% of the rows for evaluation; random_state pins the shuffle so the
# split is repeatable.
# NOTE(review): no `stratify=` is passed, so the ham/spam ratio of each part is
# left to the random shuffle.
X_train , X_test , y_train , y_test = train_test_split( X,y,test_size=0.25,random_state=43 )

In [36]:
from sklearn.naive_bayes import MultinomialNB

In [38]:
# Multinomial Naive Bayes with library-default settings (no arguments passed).
spam_detect_model = MultinomialNB( )

In [39]:
# Fit the classifier on the bag-of-words training portion.
spam_detect_model.fit(X_train , y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [41]:
# Predict labels for the held-out rows.
y_pred = spam_detect_model.predict( X_test  )

In [42]:
from sklearn.metrics import accuracy_score , classification_report

In [43]:
# Share of held-out messages whose predicted label matches the true label.
accuracy_score( y_test , y_pred )

0.9806173725771715

In [45]:
# Per-class precision / recall / F1; print() is used so the multi-line report
# string renders with its line breaks.
print( classification_report( y_test , y_pred ) )

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1202
        spam       0.94      0.92      0.93       191

    accuracy                           0.98      1393
   macro avg       0.96      0.96      0.96      1393
weighted avg       0.98      0.98      0.98      1393



## TF-IDF

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with the same vocabulary cap (2500) and n-gram range
# (unigrams + bigrams) as the CountVectorizer used for the bag-of-words run.
tfidf = TfidfVectorizer(max_features=2500 , ngram_range=(1,2))

In [57]:
# Build the dense TF-IDF matrix. This rebinds `X`, replacing the bag-of-words
# matrix created earlier.
# NOTE(review): fitted on the full corpus before train_test_split, so the
# vocabulary and IDF weights are computed with the test messages included
# (data leakage). Fitting on the training rows only avoids this.
X = tfidf.fit_transform( df['cleaned_text'] ).toarray()

In [49]:
# Inspect the learned term -> column-index mapping of the TF-IDF vectorizer.
tfidf.vocabulary_

{'go': 799,
 'point': 1602,
 'crazy': 451,
 'available': 117,
 'bugis': 229,
 'great': 854,
 'world': 2437,
 'la': 1079,
 'cine': 345,
 'got': 838,
 'wat': 2341,
 'ok': 1467,
 'lar': 1090,
 'joking': 1046,
 'wif': 2398,
 'free': 727,
 'entry': 613,
 'wkly': 2421,
 'comp': 405,
 'win': 2403,
 'cup': 459,
 'final': 683,
 'st': 1961,
 'may': 1258,
 'text': 2082,
 'receive': 1696,
 'question': 1657,
 'std': 1976,
 'txt': 2199,
 'rate': 1670,
 'apply': 79,
 'free entry': 732,
 'entry wkly': 615,
 'std txt': 1977,
 'txt rate': 2206,
 'rate apply': 1671,
 'dun': 567,
 'say': 1790,
 'early': 577,
 'already': 50,
 'nah': 1382,
 'think': 2102,
 'usf': 2275,
 'life': 1136,
 'around': 92,
 'though': 2110,
 'freemsg': 739,
 'hey': 918,
 'darling': 477,
 'week': 2367,
 'word': 2430,
 'back': 136,
 'like': 1141,
 'fun': 758,
 'still': 1979,
 'tb': 2056,
 'xxx': 2467,
 'send': 1824,
 'rcv': 1677,
 'even': 623,
 'brother': 221,
 'speak': 1943,
 'treat': 2171,
 'per': 1535,
 'request': 1732,
 'melle': 1

In [58]:
# Split the TF-IDF matrix with the same test_size and random_state as the
# bag-of-words split; this rebinds X_train / X_test / y_train / y_test.
# NOTE(review): no `stratify=` is passed, so the ham/spam ratio of each part is
# left to the random shuffle.
X_train , X_test , y_train , y_test = train_test_split( X,y,test_size=0.25,random_state=43 )

In [59]:
# Fresh default classifier for the TF-IDF run; rebinding the name discards the
# bag-of-words model.
spam_detect_model = MultinomialNB( )

In [60]:
# Fit the classifier on the TF-IDF training portion.
spam_detect_model.fit( X_train,y_train )

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [61]:
# Predict labels for the held-out TF-IDF rows (overwrites the earlier y_pred).
y_pred = spam_detect_model.predict( X_test )

In [62]:
# Share of held-out messages classified correctly with TF-IDF features.
accuracy_score( y_test , y_pred )

0.9727207465900933

In [63]:
# Per-class precision / recall / F1 for the TF-IDF run. In the recorded output
# spam recall is 0.82, versus 0.92 for the bag-of-words run above.
print( classification_report( y_test , y_pred ) )

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1202
        spam       0.98      0.82      0.89       191

    accuracy                           0.97      1393
   macro avg       0.98      0.91      0.94      1393
weighted avg       0.97      0.97      0.97      1393

